From 1f69024067217522fb86084e363059112a60c9e3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 Nov 2023 18:13:55 +0000 Subject: [PATCH 01/42] Bump rocm-docs-core from 0.28.0 to 0.30.0 in /docs/.sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.28.0 to 0.30.0. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.28.0...v0.30.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 94103e1a..22885896 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.28.0 +rocm-docs-core==0.30.0 # via -r requirements.in smmap==5.0.0 # via gitdb From fc450d9a612feb91817b8b999c03fa1044695aa7 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 30 Nov 2023 23:05:50 +0000 Subject: [PATCH 02/42] Rename sample of permutation Rename `permutation` to `simple_permutation` to comply with naming conventions. --- samples/02_permutation/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/02_permutation/CMakeLists.txt b/samples/02_permutation/CMakeLists.txt index 68857b54..ab66798c 100644 --- a/samples/02_permutation/CMakeLists.txt +++ b/samples/02_permutation/CMakeLists.txt @@ -26,7 +26,7 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(permutation permutation.cpp) + add_hiptensor_sample(simple_permutation permutation.cpp) # If building hipTensor samples as a standalone Cmake project else() add_executable(permutation permutation.cpp) From 9c4c7622a05dfbf8adae976e9be2b494372c439b Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 20 Nov 2023 22:24:28 +0000 Subject: [PATCH 03/42] Add unit test for the permute CPU implementation - compare the permute result with reference - test col/row major - test float and _Float16 --- test/02_permutation/CMakeLists.txt | 5 +- .../permutation_cpu_impl_test.cpp | 159 ++++++++++++++++++ 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 test/02_permutation/permutation_cpu_impl_test.cpp diff --git a/test/02_permutation/CMakeLists.txt b/test/02_permutation/CMakeLists.txt index 4334901c..bb2796ea 100644 --- a/test/02_permutation/CMakeLists.txt +++ b/test/02_permutation/CMakeLists.txt @@ -29,7 +29,10 @@ set(PermutationCommonSources ${HIPTENSOR_COMMON_TEST_SOURCES} # tests set (PermutationTestSources ${PermutationCommonSources} - ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/permutation_cpu_impl_test.cpp + ) + set (PermutationTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/test_params.yaml) add_hiptensor_test(permutation_test ${PermutationTestConfig} ${PermutationTestSources}) diff --git a/test/02_permutation/permutation_cpu_impl_test.cpp b/test/02_permutation/permutation_cpu_impl_test.cpp new file mode 100644 index 00000000..014dbc61 --- /dev/null +++
b/test/02_permutation/permutation_cpu_impl_test.cpp @@ -0,0 +1,159 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include + +#include "data_types.hpp" +#include "logger.hpp" +#include "permutation/permutation_cpu_reference.hpp" +#include "permutation_test.hpp" +#include "utils.hpp" +#include "llvm/hiptensor_options.hpp" + +template +auto permuteWithCpu(hipDataType typeA, hipDataType typeB, hipDataType typeCompute) +{ + std::vector modeA{'w', 'h', 'c', 'n'}; + std::vector modeB{'c', 'n', 'h', 'w'}; + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + + std::unordered_map extent; + extent['h'] = 2; + extent['w'] = 3; + extent['c'] = 4; + extent['n'] = 5; + + std::vector extentA; + for(auto mode : modeA) + { + extentA.push_back(extent[mode]); + } + std::vector extentB; + for(auto mode : modeB) + { + extentB.push_back(extent[mode]); + } + + /********************** + * Allocating data + **********************/ + + size_t elementsA = 1; + for(auto mode : modeA) + { + elementsA *= extent[mode]; + } + size_t elementsB = 1; + for(auto mode : modeB) + { + elementsB *= extent[mode]; + } + + size_t sizeA = sizeof(floatTypeA) * elementsA; + size_t sizeB = sizeof(floatTypeB) * elementsB; + + std::vector aArray(elementsA); + std::vector bArray(elementsB); + std::iota(aArray.begin(), aArray.end(), 0); + +#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR + std::vector referenceArray + = {0., 12.6, 25.2, 37.8, 50.4, 63., 75.6, 88.2, 100.8, 113.4, 126., 138.6, + 151.2, 163.8, 176.4, 189., 201.6, 214.2, 226.8, 239.4, 6.3, 18.9, 31.5, 44.1, + 56.7, 69.3, 81.9, 94.5, 107.1, 119.7, 132.3, 144.9, 157.5, 170.1, 182.7, 195.3, + 207.9, 220.5, 233.1, 245.7, 2.1, 14.7, 27.3, 39.9, 52.5, 65.1, 77.7, 90.3, + 102.9, 115.5, 128.1, 140.7, 153.3, 165.9, 178.5, 191.1, 203.7, 216.3, 228.9, 241.5, + 8.4, 21., 33.6, 46.2, 58.8, 71.4, 84., 96.6, 109.2, 121.8, 134.4, 147., + 159.6, 172.2, 184.8, 197.4, 210., 222.6, 235.2, 247.8, 4.2, 16.8, 29.4, 42., + 54.6, 67.2, 79.8, 92.4, 105., 117.6, 130.2, 142.8, 155.4, 168., 180.6, 193.2, + 205.8, 218.4, 231., 243.6, 10.5, 23.1, 35.7, 48.3, 60.9, 73.5, 86.1, 98.7, + 111.3, 123.9, 136.5, 149.1, 161.7, 174.3, 186.9, 199.5, 212.1, 224.7, 237.3, 249.9}; +#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + 
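+    // How the expected values are derived (both layouts): aArray is filled
+    // with std::iota (0, 1, 2, ...) and scaled by alphaValue (2.1, declared
+    // below), so each reference entry is 2.1 times the index of the input
+    // element that the (w, h, c, n) -> (c, n, h, w) permutation gathers into
+    // that output slot (e.g. 84. == 2.1 * 40).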
std::vector referenceArray + = {0., 84., 168., 42., 126., 210., 2.1, 86.1, 170.1, 44.1, 128.1, 212.1, + 4.2, 88.2, 172.2, 46.2, 130.2, 214.2, 6.3, 90.3, 174.3, 48.3, 132.3, 216.3, + 8.4, 92.4, 176.4, 50.4, 134.4, 218.4, 10.5, 94.5, 178.5, 52.5, 136.5, 220.5, + 12.6, 96.6, 180.6, 54.6, 138.6, 222.6, 14.7, 98.7, 182.7, 56.7, 140.7, 224.7, + 16.8, 100.8, 184.8, 58.8, 142.8, 226.8, 18.9, 102.9, 186.9, 60.9, 144.9, 228.9, + 21., 105., 189., 63., 147., 231., 23.1, 107.1, 191.1, 65.1, 149.1, 233.1, + 25.2, 109.2, 193.2, 67.2, 151.2, 235.2, 27.3, 111.3, 195.3, 69.3, 153.3, 237.3, + 29.4, 113.4, 197.4, 71.4, 155.4, 239.4, 31.5, 115.5, 199.5, 73.5, 157.5, 241.5, + 33.6, 117.6, 201.6, 75.6, 159.6, 243.6, 35.7, 119.7, 203.7, 77.7, 161.7, 245.7, + 37.8, 121.8, 205.8, 79.8, 163.8, 247.8, 39.9, 123.9, 207.9, 81.9, 165.9, 249.9}; + +#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + + const floatTypeCompute alphaValue = 2.1f; + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + hiptensorTensorDescriptor_t descA; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor( + handle, &descA, nmodeA, extentA.data(), NULL /* stride */, typeA, HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t descB; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor( + handle, &descB, nmodeB, extentB.data(), NULL /* stride */, typeB, HIPTENSOR_OP_IDENTITY)); + + hiptensor::detail::permuteByCpu(&alphaValue, + aArray.data(), + &descA, + modeA.data(), + bArray.data(), + &descB, + modeB.data(), + typeCompute); + return compareEqual(referenceArray.data(), bArray.data(), bArray.size(), 10); +} + +TEST(PermutationCpuImplTest, CompareF32ResultWithReference) +{ + typedef float floatTypeA; + typedef float floatTypeB; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_32F; + hipDataType typeB = HIP_R_32F; + hipDataType typeCompute = HIP_R_32F; + + auto [result, maxRelativeError] + = permuteWithCpu(typeA, typeB, typeCompute); + EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError; +} + +TEST(PermutationCpuImplTest, CompareF16ResultWithReference) +{ + typedef _Float16 floatTypeA; + typedef _Float16 floatTypeB; + typedef _Float16 floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeCompute = HIP_R_16F; + + auto [result, maxRelativeError] + = permuteWithCpu(typeA, typeB, typeCompute); + EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError; +} From bba3217ade0b285d9a1f718f8d74f8766121e63b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:01:47 +0000 Subject: [PATCH 04/42] Bump gitpython from 3.1.35 to 3.1.37 in /docs/.sphinx Bumps [gitpython](https://github.com/gitpython-developers/GitPython) from 3.1.35 to 3.1.37. - [Release notes](https://github.com/gitpython-developers/GitPython/releases) - [Changelog](https://github.com/gitpython-developers/GitPython/blob/main/CHANGES) - [Commits](https://github.com/gitpython-developers/GitPython/compare/3.1.35...3.1.37) --- updated-dependencies: - dependency-name: gitpython dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 22885896..0441a14a 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -40,7 +40,7 @@ fastjsonschema==2.16.3 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests From 3f7a904e18f5473ef11f992bd5c285b1425b06cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:02:23 +0000 Subject: [PATCH 05/42] Bump urllib3 from 1.26.15 to 1.26.18 in /docs/.sphinx Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.15 to 1.26.18. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.15...1.26.18) --- updated-dependencies: - dependency-name: urllib3 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 22885896..60e980b6 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -143,7 +143,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme -urllib3==1.26.15 +urllib3==1.26.18 # via requests wrapt==1.15.0 # via deprecated From 23b46d62952e22de4372bc40ffa1eaf5e59b3a9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 08:46:44 +0000 Subject: [PATCH 06/42] Bump rocm-docs-core from 0.30.0 to 0.30.1 in /docs/.sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.0 to 0.30.1. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.0...v0.30.1) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 22885896..17b81d3d 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.30.0 +rocm-docs-core==0.30.1 # via -r requirements.in smmap==5.0.0 # via gitdb From 852992e891fa54f7e3384485f5f12294f26af385 Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Thu, 7 Dec 2023 11:04:21 -0700 Subject: [PATCH 07/42] Fix spelling in documentation (#155) * Fix spelling in documentation * Use code directive to escape code keywords * Revert "Use code directive to escape code keywords" This reverts commit 7be7e3446830f2d7dd2760f140c7dfa30a246f78. 
* Disable spellcheck on API Reference Guide * Fix spelling in API Reference Guide hiptensor > hipTensor --- docs/API_Reference_Guide.rst | 7 +++++-- docs/Contributors_Guide.rst | 25 +++++++++++------------ docs/Linux_Install_Guide.rst | 29 ++++++++++++++------------- docs/Programmers_Guide.rst | 39 ++++++++++++++++++------------------ docs/index.rst | 4 ++-- 5 files changed, 53 insertions(+), 51 deletions(-) diff --git a/docs/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst index 551e2ee0..77e86343 100644 --- a/docs/API_Reference_Guide.rst +++ b/docs/API_Reference_Guide.rst @@ -3,15 +3,16 @@ Introduction ************ -hiptensor Data Types +hipTensor Data Types ==================== +.. + hiptensorStatus_t ----------------- .. doxygenenum:: hiptensorStatus_t - hiptensorComputeType_t ---------------------- @@ -160,3 +161,5 @@ hiptensorLoggerForceDisable --------------------------- .. doxygenfunction:: hiptensorLoggerForceDisable + +.. diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst index aeb87211..d75a884b 100644 --- a/docs/Contributors_Guide.rst +++ b/docs/Contributors_Guide.rst @@ -15,8 +15,7 @@ License Agreement Pull-request guidelines ======================= - -Our code contriubtion guidelines closely follows the model of `GitHub +Our code contribution guidelines closely follows the model of `GitHub pull-requests `__. The hipTensor repository follows a workflow which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. Pull requests should: @@ -30,7 +29,7 @@ The hipTensor repository follows a workflow which dictates a /master branch wher - code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. -StyleGuide +Style Guide ========== This project follows the `CPP Core @@ -44,7 +43,7 @@ Interface --------- - Library code should use C++17 -- Avoid CamelCase +- Avoid Camel case - This rule applies specifically to publicly visible APIs, but is also encouraged (not mandated) for internal code @@ -52,8 +51,8 @@ Philosophy ---------- - `P.2 `__: - Write in ISO Standard C++14 (especially to support windows, linux and - macos plaforms ) + Write in ISO Standard C++14 (especially to support Windows, Linux and + macOS platforms ) - `P.5 `__: Prefer compile-time checking to run-time checking @@ -105,19 +104,19 @@ will result in different results. To format a file, use: -:: +.. code-block:: - /opt/rocm/llvm/bin/clang-format -style=file -i + /opt/rocm/llvm/bin/clang-format -style=file -i To format all files, run the following script in hipTensor directory: -:: +.. code-block:: - #!/bin/bash - git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i + #!/bin/bash + git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i Also, githooks can be installed to format the code per-commit: -:: +.. code-block:: - ./.githooks/install + ./.githooks/install diff --git a/docs/Linux_Install_Guide.rst b/docs/Linux_Install_Guide.rst index 47cdc339..ace565c1 100644 --- a/docs/Linux_Install_Guide.rst +++ b/docs/Linux_Install_Guide.rst @@ -104,9 +104,9 @@ Minimum ROCm version support is 5.7. By default, the project is configured as Release mode. -To build only library, run the following comomand : +To build only library, run the following command : - CC=hipcc CXX=hipcc cmake -B . 
-DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF + :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF` Here are some other example project configurations: @@ -116,30 +116,30 @@ Here are some other example project configurations: +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ | Configuration | Command | +===================================+====================================================================================================================+ -| Basic | CC=hipcc CXX=hipcc cmake -B . | +| Basic | :code:`CC=hipcc CXX=hipcc cmake -B .` | +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Targeting gfx908 | CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack- | +| Targeting gfx908 | :code:`CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-` | +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Debug build | CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug | +| Debug build | :code:`CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug` | +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ After configuration, build with - cmake --build -- -j + :code:`cmake --build -- -j` Build library + samples ^^^^^^^^^^^^^^^^^^^^^^^ -To build library and samples, run the following comomand : +To build library and samples, run the following command: - CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON + :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON` After configuration, build with - cmake --build -- -j + :code:`cmake --build -- -j` -The samples folder in contains executables in the table below. +The samples folder in :code:`` contains executables in the table below. =================================== =================================================================================== executable name description @@ -154,13 +154,13 @@ Build library + tests To build library and tests, run the following command : - CC=hipcc CXX=hipcc cmake -B . + :code:`CC=hipcc CXX=hipcc cmake -B .` After configuration, build with - cmake --build -- -j + :code:`cmake --build -- -j` -The tests in contains executables in the table below. +The tests in `` contains executables in the table below. ====================================== =================================================================================== executable name description @@ -177,6 +177,7 @@ Build library + Documentation Run the steps below to build documentation locally. +.. code-block:: cd docs sudo apt-get update @@ -191,4 +192,4 @@ Run the steps below to build documentation locally. 
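    # produces hiptensor.pdf from the generated LaTeX sources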
pdflatex hiptensor.tex -Generates hiptensor.pdf here +Generates :code:`hiptensor.pdf` here diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst index 460bb970..1eaf9adf 100644 --- a/docs/Programmers_Guide.rst +++ b/docs/Programmers_Guide.rst @@ -1,4 +1,3 @@ - =================== Programmer's Guide =================== @@ -17,13 +16,13 @@ The hipTensor code is split into four major parts: The `library` directory ^^^^^^^^^^^^^^^^^^^^^^^ -library/include/hiptensor/ +`library/include/hiptensor/` ''''''''''''''''''''''''''' Contains C++ include files for the hipTensor API. These files also contain Doxygen comments that document the API. -library/include/hiptensor/internal +`library/include/hiptensor/internal` '''''''''''''''''''''''''''''''''' Internal include files for: @@ -31,58 +30,58 @@ Internal include files for: - Utility Code - Generate Tensor Utility -library/src/ +`library/src/` '''''''''''' Contains logger, device and performance functions. -library/src/contraction/ +`library/src/contraction/` '''''''''''''''''''''''' Contains hipTensor core composable kernel header functions and contraction initialization functions. -library/src/contraction/device +`library/src/contraction/device` '''''''''''''''''''''''''''''' Contains hipTensor Bilinear and Scale instance functions The `samples` directory ^^^^^^^^^^^^^^^^^^^^^^^ -01_contraction/simple_bilinear_contraction_f32.cpp +`01_contraction/simple_bilinear_contraction_f32.cpp` '''''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling bilinear contraction for fp32 input, output and compute types +sample code for calling bilinear contraction for :code:`fp32` input, output and compute types -01_contraction/simple_scale_contraction_f32.cpp +`01_contraction/simple_scale_contraction_f32.cpp` ''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling scale contraction for fp32 input, output and compute types +sample code for calling scale contraction for :code:`fp32` input, output and compute types The `test` directory ^^^^^^^^^^^^^^^^^^^^^^^ -00_unit/logger +`00_unit/logger` '''''''''''''' Test code for testing logger API Functions of hipTensor -01_contraction/bilinear_contraction_f32 +`01_contraction/bilinear_contraction_f32` ''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F32 types. -01_contraction/bilinear_contraction_f64 +`01_contraction/bilinear_contraction_f64` ''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F64 types. -01_contraction/scale_contraction_f32 +`01_contraction/scale_contraction_f32` '''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F32 types. -01_contraction/scale_contraction_f64 +`01_contraction/scale_contraction_f64` '''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F64 types. @@ -90,11 +89,11 @@ Test code for testing the scale contraction functionality and log metrics for F6 Infrastructure ^^^^^^^^^^^^^^ -- CMake is used to build and package hipTensor. There are CMakeLists.txt files throughout the code. -- Doxygen/Breathe/Sphinx/ReadTheDocs are used to produce documentation. Content for the documentation is from: +- CMake is used to build and package hipTensor. There are :code:`CMakeLists.txt` files throughout the code. +- `Doxygen/Breathe/Sphinx/ReadtheDocs` are used to produce documentation. 
Content for the documentation is from: - - Doxygen comments in include files in the directory library/include - - files in the directory docs/ + - Doxygen comments in include files in the directory :code:`library/include` + - files in the directory :code:`docs/` - Jenkins is used to automate Continuous Integration testing. -- clang-format is used to format C++ code. +- :code:`clang-format` is used to format C++ code. diff --git a/docs/index.rst b/docs/index.rst index 566a00e5..ba5e1cb7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,6 @@ ============================================================================ -hiptensor: A High-Performance HIP Library For Tensor Primitives +hipTensor: A High-Performance HIP Library For Tensor Primitives ============================================================================ -hiptensor is AMD's C++ library for accelerating tensor primitives based on the +hipTensor is AMD's C++ library for accelerating tensor primitives based on the composable kernel library, through general purpose kernel languages, like HIP C++. From c5fbcec9afdfe109b1ba1a15d74d819beec2e6fe Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 22 Nov 2023 16:21:33 +0000 Subject: [PATCH 08/42] Add support to f16 and bf16 to contraction - Support _Float16 - Support hip_bfloat16 - Add unit test of _Float16 and hip_bfloat16 - Add sample of _Float16 and hip_bfloat16 --- .../hiptensor/internal}/config.hpp | 0 .../hiptensor/internal/hiptensor_utility.hpp | 1 + .../hiptensor/internal}/native_types.hpp | 0 .../hiptensor/internal}/native_types_impl.hpp | 0 .../hiptensor/internal}/type_traits.hpp | 0 .../hiptensor/internal}/types.hpp | 0 .../hiptensor/internal}/types_ext.hpp | 0 .../hiptensor/internal}/xfloat32.hpp | 0 .../contraction_cpu_reference_impl.hpp | 19 +- .../contraction_cpu_reference_instances.cpp | 56 +++ .../contraction/contraction_meta_traits.hpp | 47 ++- .../src/contraction/contraction_selection.cpp | 280 +++++++++++++- .../src/contraction/contraction_solution.hpp | 3 +- .../contraction/contraction_solution_impl.hpp | 6 +- .../contraction_solution_instances.cpp | 57 +++ library/src/contraction/device/CMakeLists.txt | 56 ++- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 62 ++++ ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 62 ++++ ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 62 ++++ ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_mknn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 62 ++++ ...hpp => hiptensor_contraction_bilinear.hpp} | 0 ...le.hpp => hiptensor_contraction_scale.hpp} | 0 .../src/contraction/hiptensor_contraction.cpp | 11 - library/src/hiptensor.cpp | 3 +- samples/01_contraction/CMakeLists.txt | 15 + .../simple_bilinear_contraction_bf16.cpp | 342 ++++++++++++++++++ .../simple_bilinear_contraction_f16.cpp | 342 ++++++++++++++++++ .../simple_scale_contraction_bf16.cpp | 334 +++++++++++++++++ .../simple_scale_contraction_f16.cpp | 334 
+++++++++++++++++ .../configs/bilinear_test_params.yaml | 2 + .../configs/scale_test_params.yaml | 2 + test/01_contraction/contraction_test.cpp | 109 +++++- test/device/common.hpp | 2 +- test/llvm/yaml_parser_config.cpp | 1 + test/utils.hpp | 2 +- 47 files changed, 2944 insertions(+), 72 deletions(-) rename library/{src/include => include/hiptensor/internal}/config.hpp (100%) rename library/{src/include => include/hiptensor/internal}/native_types.hpp (100%) rename library/{src/include => include/hiptensor/internal}/native_types_impl.hpp (100%) rename library/{src/include => include/hiptensor/internal}/type_traits.hpp (100%) rename library/{src/include => include/hiptensor/internal}/types.hpp (100%) rename library/{src/include => include/hiptensor/internal}/types_ext.hpp (100%) rename library/{src/include => include/hiptensor/internal}/xfloat32.hpp (100%) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp rename library/src/contraction/device/{contraction_bilinear.hpp => hiptensor_contraction_bilinear.hpp} (100%) rename library/src/contraction/device/{contraction_scale.hpp => hiptensor_contraction_scale.hpp} (100%) create mode 100644 samples/01_contraction/simple_bilinear_contraction_bf16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f16.cpp 
create mode 100644 samples/01_contraction/simple_scale_contraction_bf16.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_f16.cpp diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp similarity index 100% rename from library/src/include/config.hpp rename to library/include/hiptensor/internal/config.hpp diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp index f2df2dd2..c386bbe0 100644 --- a/library/include/hiptensor/internal/hiptensor_utility.hpp +++ b/library/include/hiptensor/internal/hiptensor_utility.hpp @@ -31,6 +31,7 @@ #include #include "../hiptensor_types.hpp" +#include "types_ext.hpp" #ifndef CHECK_HIP_ERROR #define CHECK_HIP_ERROR(expression) \ diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp similarity index 100% rename from library/src/include/native_types.hpp rename to library/include/hiptensor/internal/native_types.hpp diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp similarity index 100% rename from library/src/include/native_types_impl.hpp rename to library/include/hiptensor/internal/native_types_impl.hpp diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp similarity index 100% rename from library/src/include/type_traits.hpp rename to library/include/hiptensor/internal/type_traits.hpp diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp similarity index 100% rename from library/src/include/types.hpp rename to library/include/hiptensor/internal/types.hpp diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp similarity index 100% rename from library/src/include/types_ext.hpp rename to library/include/hiptensor/internal/types_ext.hpp diff --git a/library/src/include/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp similarity index 100% rename from library/src/include/xfloat32.hpp rename to library/include/hiptensor/internal/xfloat32.hpp diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 673f6dff..ac4fc20d 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -57,6 +57,7 @@ namespace hiptensor typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, + typename ComputeDataType = ADataType, ck::enable_if_t = false> @@ -70,7 +71,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation> + CDEElementwiseOperation, + ComputeDataType> { using BaseArgument = ck::tensor_operation::device::BaseArgument; using BaseInvoker = ck::tensor_operation::device::BaseInvoker; @@ -324,7 +326,8 @@ namespace hiptensor typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>> : public MetaTraits< ck::tensor_operation::device::DeviceContractionMultipleD> + CDEElementwiseOperation, + ComputeDataType>> { }; @@ -359,7 +364,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename 
CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> auto enumerateReferenceSolutions() { using ReferenceOp = ReferenceContraction_M2_N2_K2; + CDEElementwiseOperation, + ComputeDataType>; auto solution = std::make_unique>( std::make_unique()); diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 106dd5ff..146d2721 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -32,6 +32,34 @@ namespace hiptensor ContractionCpuReferenceInstances::ContractionCpuReferenceInstances() { // Register all the solutions exactly once + // Bilinear f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<_Float16>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateReferenceSolutions<2, @@ -58,6 +86,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear>()); + // Scale f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateReferenceSolutions<2, diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 4fa7acf7..ab158f96 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -49,7 +49,8 @@ namespace hiptensor typename DsDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Bilinear, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = DsDataType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT + = std::conditional_t, hip_bfloat16, DsDataType>; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = 
ck::tensor_operation::element_wise::Bilinear; }; // Partial specialize for Scale contraction @@ -82,7 +88,8 @@ namespace hiptensor typename BDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = NoneType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT = NoneType; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Scale; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index aaa624f6..1b2cf92e 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -150,6 +150,192 @@ namespace hiptensor } } + // test + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 7255639152084218514; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 7255639152084218514; + + 
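+            // The hard-coded id above pins a known-good pre-registered kernel
+            // instance for this type combination; d1..d6 are extracted,
+            // presumably for a later size-aware (actor-critic) selection, but
+            // are currently unused. If the id is absent from the candidate
+            // registry, the lookup below fails rather than falling back to
+            // another instance.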
if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + // end test + template <> struct ActorCriticSelection { @@ -1418,7 +1604,99 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { - if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F + && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + 
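+        // bf16 follows the same dispatch rule as f16 above: typeD == NONE_TYPE
+        // routes to the scale contraction, a concrete D type to bilinear.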
else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE + && typeE == HIP_R_32F) { return ActorCriticSelection:: selectWinner(winner, diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index 0037584e..e76bb351 100644 --- a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -147,7 +147,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> std::vector> enumerateContractionSolutions(); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 0fb5df9d..5e191441 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -274,7 +274,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> std::vector> enumerateContractionSolutions() { using ContractionOp @@ -287,7 +288,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation>; + CDEElementwiseOperation, + ComputeDataType>; using Factory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index fd263a8b..6d481577 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -32,6 +32,35 @@ namespace hiptensor ContractionSolutionInstances::ContractionSolutionInstances() { // Register all the solutions exactly once + + // Bilinear bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateContractionSolutions<2, @@ -58,6 +87,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, 
ck::tensor_operation::element_wise::Bilinear>()); + // Scale bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateContractionSolutions<2, diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index f2e4a0fb..b9b382c0 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -24,24 +24,40 @@ # ############################################################################### -set(CK_CONTRACTION_INSTANCE_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp -) + set(CK_CONTRACTION_INSTANCE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + 
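+    # f16 bilinear instances; like the bf16 set above, these accumulate in f32
+    # (the compute_f32 suffix, matching ComputeDataType = float in this patch)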
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) -add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) -target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) + add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) + 
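+    # PRIVATE: composable_kernel headers are needed to compile the instances
+    # themselves, not by consumers of the component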
target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..7d777a83 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..a9a97148 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..d83d8d16 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
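+// (Naming, on our reading of composable_kernel: "xdl" marks the MFMA/XDLOPS-based
+// GEMM pipeline, and "c_shuffle" its epilogue, which stages the accumulator tile
+// through LDS so that global stores coalesce.)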
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..bc49c82b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
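+// (Four stride variants -- kknn/knnn/mknn/mnnn -- are built per type combination so
+// that an instance exists whichever of A's and B's modes is fastest-varying in
+// memory; hipTensor presumably selects among them at plan time from the tensors'
+// strides.)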
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..a9d963ab --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
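+// For reference, a scalar sketch of the bilinear contraction that every instance in
+// this family implements (illustration only, with hypothetical extents M0..K1 and
+// dense row-major indexing; the device kernels tile, vectorize, and use MFMA instead):
+//
+//     for(int m0 = 0; m0 < M0; ++m0)
+//     for(int m1 = 0; m1 < M1; ++m1)
+//     for(int n0 = 0; n0 < N0; ++n0)
+//     for(int n1 = 0; n1 < N1; ++n1)
+//     {
+//         float acc = 0.f;
+//         for(int k0 = 0; k0 < K0; ++k0)
+//         for(int k1 = 0; k1 < K1; ++k1)
+//         {
+//             acc += float(A[((m0 * M1 + m1) * K0 + k0) * K1 + k1])
+//                  * float(B[((n0 * N1 + n1) * K0 + k0) * K1 + k1]);
+//         }
+//         E[((m0 * M1 + m1) * N0 + n0) * N1 + n1]
+//             = alpha * acc
+//             + beta * float(D[((m0 * M1 + m1) * N0 + n0) * N1 + n1]);
+//     }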
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..c139942e --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..3c6ced30 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..33c66296 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..05400151 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
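+// (The "scale" family that begins here is the bilinear operator without the D input:
+// the epilogue reduces to E[m0, m1, n0, n1] = alpha * sum over (k0, k1) of A * B,
+// which is why these names carry three element types rather than four.)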
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..bba95b14 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..fb5ecec0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1dd6613c --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..e98aee20 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
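+// (As in the bilinear f16/bf16 families, "compute_f32" means products are accumulated
+// in float even though A, B, and E are 16-bit; the samples request the matching
+// HIPTENSOR_COMPUTE_32F compute type.)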
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..db8de1c0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..397ef327 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1f9221dc --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear.hpp similarity index 100% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear.hpp diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale.hpp similarity index 100% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale.hpp diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..b96a204e 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -708,17 +708,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..51af1f48 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..15972d60 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,15 +26,30 @@ # Check whether building within hiptensor context if( 
CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) + add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) + target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp new file mode 100644 index 00000000..0a4a9314 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeC = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
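+        // Device scratch recommended by hiptensorContractionGetWorkspaceSize above;
+        // a returned size of zero means the contraction can run without a workspace.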
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp new file mode 100644 index 00000000..d9d044c9 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeC = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
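+        // HIPTENSOR_WORKSPACE_RECOMMENDED was requested above; a leaner
+        // HIPTENSOR_WORKSPACE_MIN query may also be usable when device memory is
+        // tight (assuming this hipTensor version exposes that preference).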
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp new file mode 100644 index 00000000..e05916bf --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeD = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
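For the modes declared above, this sample evaluates (informally)
+ * D[m,n,u,v] = alpha * sum over h,k of A[m,n,h,k] * B[u,v,h,k],
+ * with m, n, u, v free and h, k contracted, via the steps below.
+ *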
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp new file mode 100644 index 00000000..1e62be85 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeD = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
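Storage here is _Float16 while typeCompute is HIPTENSOR_COMPUTE_32F,
+ * so alpha is supplied as a float and the h,k reduction is expected
+ * to accumulate at f32 precision before narrowing back to f16.
+ *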
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" 
<< std::endl; + + return 0; +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..a08065a0 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] Algorithm Types: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..b28e9a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] Algorithm Types: diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..9446157f 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,7 +56,8 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) || (isF64Supported() && datatype == HIP_R_64F); } @@ -115,11 +116,15 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) + || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) || (CDataType == NONE_TYPE)); - EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); mRunFlag &= checkDevice(DDataType); @@ -228,7 +233,35 @@ namespace hiptensor auto resource = getResource(); resource->resizeStorage(lengths, elementBytes); - if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) + if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F) + { + // Initialize matrix data on device + fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA); + fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16F) + { + fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD); + } + fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + elementsCD, + 
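// D is seeded with signaling NaNs so any element the kernel
+ // fails to write is caught by validation instead of passing
+ // silently
+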
std::numeric_limits<_Float16>::signaling_NaN()); + } + else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF) + { + // Initialize matrix data on device + fillLaunchKernel((hip_bfloat16*)resource->deviceA().get(), elementsA); + fillLaunchKernel((hip_bfloat16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16BF) + { + fillLaunchKernel((hip_bfloat16*)resource->deviceC().get(), + elementsCD); + } + fillValLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); + } + else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) { // Initialize matrix data on device fillLaunchKernel((float*)resource->deviceA().get(), elementsA); @@ -328,7 +361,7 @@ namespace hiptensor { auto resource = getResource(); - int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)); + int size = hipDataTypeSize(DDataType); size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(), a_ms_ks.mLengths.end(), @@ -346,7 +379,50 @@ namespace hiptensor auto D = resource->allocHost(elementsCD * size); resource->copyData(D, resource->deviceD(), elementsCD * size); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_16BF) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_32F) { stream << "Tensor A elements:\n"; hiptensorPrintArrayElements( @@ -456,11 +532,24 @@ namespace hiptensor size_t{1}, std::multiplies()); - int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? 
sizeof(float) : sizeof(double)); + int sizeD = elementsCD * hipDataTypeSize(DDataType); auto reference = resource->allocDevice(sizeD); resource->copyData(reference, resource->hostD(), sizeD); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( + (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + } + else if(DDataType == HIP_R_16BF) + { + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + (hip_bfloat16*)reference.get(), + elementsCD); + } + else if(DDataType == HIP_R_32F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD);
diff --git a/test/device/common.hpp b/test/device/common.hpp index f961abc1..172e6953 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,7 +72,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast(value); } }
diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 46f4c43e..cd3eb46f 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -110,6 +110,7 @@ namespace llvm static void enumeration(IO& io, hipDataType& value) { io.enumCase(value, "HIP_R_16F", HIP_R_16F); + io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE);
diff --git a/test/utils.hpp b/test/utils.hpp index 1f7ece44..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -41,9 +41,9 @@ #include #include #include +#include #include "device/common.hpp" -#include "types.hpp" #define HIPTENSOR_FREE_DEVICE(ptr) \ if(ptr != nullptr) \
From 185a2ab115d4e6e8999917e349ca1a4d803e5228 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Sat, 25 Nov 2023 02:38:48 +0000 Subject: [PATCH 09/42] Add f32_f16, f32_bf16, and f64_f32 support to contraction - Support ABCD data type f32 with compute types f16 and bf16 - Support ABCD data type f64 with compute type f32 - Fix bug: alpha and beta were passed with the wrong data type in the contraction unit test - Create sample templates for contraction --- .../contraction/contraction_cpu_reference.cpp | 48 +- .../contraction/contraction_cpu_reference.hpp | 39 +- .../contraction_cpu_reference_instances.cpp | 104 ++- .../contraction/contraction_meta_traits.hpp | 18 +- .../src/contraction/contraction_selection.cpp | 737 ++++++++++++++---- .../src/contraction/contraction_selection.hpp | 9 +- .../contraction/contraction_solution_impl.hpp | 9 +- .../contraction_solution_instances.cpp | 88 ++- .../contraction_solution_params.hpp | 9 +- .../contraction_solution_params_impl.hpp | 6 + .../contraction_solution_registry.cpp | 81 +- .../contraction_solution_registry.hpp | 60 +- library/src/contraction/device/CMakeLists.txt | 28 +- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 27 +-
..._f16_f16_f16_compute_f32_mknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 27 +- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 85 ++ ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 27 +- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 85 ++ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 62 ++ .../src/contraction/hiptensor_contraction.cpp | 59 +- library/src/data_types.cpp | 43 + library/src/include/data_types.hpp | 2 + samples/01_contraction/CMakeLists.txt | 31 + .../simple_bilinear_contraction.hpp | 351 +++++++++ .../simple_bilinear_contraction_bf16.cpp | 313 +------- .../simple_bilinear_contraction_f16.cpp | 313 +------- .../simple_bilinear_contraction_f32.cpp | 313 +------- .../simple_bilinear_contraction_f32_bf16.cpp | 57 ++ .../simple_bilinear_contraction_f32_f16.cpp | 57 ++ .../simple_bilinear_contraction_f64.cpp | 57 ++ .../simple_bilinear_contraction_f64_f32.cpp | 57 ++ .../simple_scale_contraction.hpp | 341 ++++++++ .../simple_scale_contraction_bf16.cpp | 311 +------- .../simple_scale_contraction_f16.cpp | 317 +------- .../simple_scale_contraction_f32.cpp | 310 +------- .../simple_scale_contraction_f32_bf16.cpp | 58 ++ .../simple_scale_contraction_f32_f16.cpp | 58 ++ .../simple_scale_contraction_f64.cpp | 57 ++ .../simple_scale_contraction_f64_f32.cpp | 57 ++ .../configs/bilinear_test_params.yaml | 7 +- .../configs/scale_test_params.yaml | 3 + test/01_contraction/contraction_test.cpp | 18 +- 76 files changed, 4657 insertions(+), 2149 deletions(-) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp create mode 100644 
library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction.hpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp create mode 100644 samples/01_contraction/simple_scale_contraction.hpp create mode 100644 samples/01_contraction/simple_scale_contraction_f32_bf16.cpp create mode 100644 
samples/01_contraction/simple_scale_contraction_f32_f16.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_f64.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_f64_f32.cpp diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp index 13dcdffd..ac1d9711 100644 --- a/library/src/contraction/contraction_cpu_reference.cpp +++ b/library/src/contraction/contraction_cpu_reference.cpp @@ -28,31 +28,33 @@ #include "contraction_cpu_reference_impl.hpp" #include "contraction_cpu_reference_instances.hpp" -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ns_ks_lengths, - std::vector const& b_ns_ks_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - void* workspace) +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace) { - auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto computeType = plan->mContractionDesc.mComputeType; auto candidates - = (C == nullptr) - ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD) - : instances->allSolutions().query(typeA, typeB, typeC, typeD); + = (C == nullptr) ? 
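// computeType is now part of the solution key; e.g. a scale
+ // contraction on f32 tensors with f16 compute would resolve via
+ // query(HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F)
+ // (illustrative values, mirroring the test configs)
+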
instances->allSolutions().query( + typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType) + : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType); auto toCKVec = [](auto& inputVec) { return std::vector(inputVec.begin(), inputVec.end()); }; diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp index aadb062e..471026dc 100644 --- a/library/src/contraction/contraction_cpu_reference.hpp +++ b/library/src/contraction/contraction_cpu_reference.hpp @@ -32,24 +32,25 @@ #include -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ks_ns_lengths, - std::vector const& b_ks_ns_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - void* workspace); +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ks_ns_lengths, + std::vector const& b_ks_ns_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace); #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 146d2721..173a49e9 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -37,10 +37,10 @@ namespace hiptensor enumerateReferenceSolutions<2, 2, 2, - _Float16, - _Float16, - ck::Tuple<_Float16>, - _Float16, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear, @@ -71,7 +71,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); // Bilinear f64 registerSolutions( @@ -84,17 +111,31 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + 
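// the trailing template argument below is the new ComputeDataType;
+ // each (A, B, Ds, E) combination is enumerated once per supported
+ // compute type (e.g. float tensors with ck::half_t or ck::bhalf_t)
+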
enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale f16 registerSolutions( enumerateReferenceSolutions<2, 2, 2, - _Float16, - _Float16, + ck::half_t, + ck::half_t, ck::Tuple<>, - _Float16, + ck::half_t, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, @@ -125,7 +166,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f64 registerSolutions( @@ -138,6 +206,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index ab158f96..6a7cb35f 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -75,9 +75,12 @@ namespace hiptensor = std::conditional_t, hip_bfloat16, DsDataType>; using EDataT = std::conditional_t, hip_bfloat16, EDataType>; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using ComputeDataT = std::conditional_t, + hip_bfloat16, + ComputeDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Bilinear; }; // Partial specialize for Scale contraction @@ -113,9 +116,12 @@ namespace hiptensor using DDataT = NoneType; using EDataT = std::conditional_t, hip_bfloat16, EDataType>; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using ComputeDataT = std::conditional_t, + hip_bfloat16, + ComputeDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Scale; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 1b2cf92e..888ef4c1 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -54,6 +54,7 @@ namespace hiptensor 
hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { // Make sure that we calculate full element space in case strides are not packed.
@@ -70,9 +71,11 @@ namespace hiptensor auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides) * hipDataTypeSize(typeE); - void *A_d, *B_d, *D_d, *E_d, *wspace; - float alpha = 1.02f; - float beta = 1.03f; + void * A_d, *B_d, *D_d, *E_d, *wspace; + double alpha = 0.0; + double beta = 0.0; + writeVal(&alpha, computeType, 1.02); + writeVal(&beta, computeType, 1.03); CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB));
@@ -150,9 +153,13 @@ namespace hiptensor } } - // test template <> - struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE> + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -196,7 +203,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR> + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -224,7 +236,7 @@ namespace hiptensor size_t unique_id = 0; - // select unique_id + // TODO select unique_id unique_id = 7255639152084218514; if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -244,7 +256,8 @@ namespace hiptensor hip_bfloat16, hip_bfloat16, hip_bfloat16, - ContractionOpId_t::SCALE> + ContractionOpId_t::SCALE, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -292,7 +305,8 @@ namespace hiptensor hip_bfloat16, hip_bfloat16, hip_bfloat16, - ContractionOpId_t::BILINEAR> + ContractionOpId_t::BILINEAR, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -320,7 +334,7 @@ namespace hiptensor size_t unique_id = 0; - // select unique_id + // TODO select unique_id unique_id = 8689089455041651212; if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -334,10 +348,183 @@ namespace hiptensor } } }; - // end test template <> - struct ActorCriticSelection + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner,
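// each specialization pins a kernel unique_id chosen offline by the
+ // actor-critic model; ids left at 0 below are placeholders for the
+ // new compute-type combinations (see the TODO markers)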
+ std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -702,7 +889,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1060,7 +1247,92 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& 
candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1335,7 +1607,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1602,181 +1874,344 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { - if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::SCALE>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F - && typeE == HIP_R_16F) + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return 
ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::BILINEAR>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, 
+ d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_64F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - 
a_ms_ks_lengths,
-                                                                 a_ms_ks_strides,
-                                                                 typeB,
-                                                                 b_ns_ks_lengths,
-                                                                 b_ns_ks_strides,
-                                                                 typeD,
-                                                                 d_ms_ns_lengths,
-                                                                 d_ms_ns_strides,
-                                                                 typeE,
-                                                                 e_ms_ns_lengths,
-                                                                 e_ms_ns_strides,
-                                                                 workspaceSize);
+                                              ContractionOpId_t::BILINEAR,
+                                              double>::selectWinner(winner,
+                                                                    candidates,
+                                                                    typeA,
+                                                                    a_ms_ks_lengths,
+                                                                    a_ms_ks_strides,
+                                                                    typeB,
+                                                                    b_ns_ks_lengths,
+                                                                    b_ns_ks_strides,
+                                                                    typeD,
+                                                                    d_ms_ns_lengths,
+                                                                    d_ms_ns_strides,
+                                                                    typeE,
+                                                                    e_ms_ns_lengths,
+                                                                    e_ms_ns_strides,
+                                                                    workspaceSize);
     }

     return HIPTENSOR_STATUS_EXECUTION_FAILED;
 }
diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp
index 9ceb6a14..deb980d9 100644
--- a/library/src/contraction/contraction_selection.hpp
+++ b/library/src/contraction/contraction_selection.hpp
@@ -49,9 +49,15 @@ namespace hiptensor
                      hipDataType                     typeE,
                      std::vector<std::size_t> const& e_ms_ns_lengths,
                      std::vector<std::size_t> const& e_ms_ns_strides,
+                     hiptensorComputeType_t          computeType,
                      const uint64_t                  workspaceSize);

-    template <typename TypeA, typename TypeB, typename TypeD, typename TypeE, ContractionOpId_t OpId>
+    template <typename TypeA,
+              typename TypeB,
+              typename TypeD,
+              typename TypeE,
+              ContractionOpId_t OpId,
+              typename ComputeT>
     struct ActorCriticSelection
     {
         static hiptensorStatus_t
@@ -87,6 +93,7 @@ namespace hiptensor
                       hipDataType                     typeE,
                       std::vector<std::size_t> const& e_ms_ns_lengths,
                       std::vector<std::size_t> const& e_ms_ns_strides,
+                      hiptensorComputeType_t          computeType,
                       const uint64_t                  workspaceSize);
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp
index 5e191441..3b672fbb 100644
--- a/library/src/contraction/contraction_solution_impl.hpp
+++ b/library/src/contraction/contraction_solution_impl.hpp
@@ -95,11 +95,13 @@ namespace hiptensor

             if(alpha != nullptr)
             {
-                alphaF = hiptensor::readVal<float>(alpha, HipDataType_v<typename Traits::ComputeDataT>);
+                alphaF = hiptensor::readVal<float>(
+                    alpha, convertToComputeType(HipDataType_v<typename Traits::ComputeDataT>));
             }
             if(beta != nullptr)
             {
-                betaF = hiptensor::readVal<float>(beta, HipDataType_v<typename Traits::ComputeDataT>);
+                betaF = hiptensor::readVal<float>(
+                    beta, convertToComputeType(HipDataType_v<typename Traits::ComputeDataT>));
             }

             // CK has its own format for indices...
@@ -205,7 +207,8 @@ namespace hiptensor

             if(alpha != nullptr)
             {
-                alphaF = hiptensor::readVal<float>(alpha, HipDataType_v<typename Traits::ComputeDataT>);
+                alphaF = hiptensor::readVal<float>(
+                    alpha, convertToComputeType(HipDataType_v<typename Traits::ComputeDataT>));
             }

             // CK has its own format for indices...
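
Taken together, the chain above maps every supported (typeA, typeB, typeD, typeE, computeType) tuple onto one ActorCriticSelection specialization, so the compute type is now part of the specialization key rather than being implied by the data types. The following is a minimal sketch of that pattern with simplified signatures; ActorCriticSelectionSketch and selectWinnerSketch are illustrative names, not hipTensor API.

#include <hip/library_types.h> // hipDataType, HIP_R_16F, HIP_R_32F, HIP_R_64F

// Stand-in for the real ActorCriticSelection; OpId and ComputeT mirror the
// template parameters the patch threads through above.
template <typename TypeA, typename TypeB, typename TypeD, typename TypeE,
          int OpId, typename ComputeT>
struct ActorCriticSelectionSketch
{
    static int selectWinner()
    {
        return 0; // would rank kernel candidates for this exact type combination
    }
};

inline int selectWinnerSketch(hipDataType typeA, hipDataType typeE, hipDataType computeType)
{
    constexpr int kScale = 0; // stand-in for ContractionOpId_t::SCALE
    // f32 data contracted with f16 accumulation and with f32 accumulation now
    // resolve to two different specializations.
    if(typeA == HIP_R_32F && typeE == HIP_R_32F && computeType == HIP_R_16F)
        return ActorCriticSelectionSketch<float, float, float, float, kScale, _Float16>::selectWinner();
    if(typeA == HIP_R_32F && typeE == HIP_R_32F && computeType == HIP_R_32F)
        return ActorCriticSelectionSketch<float, float, float, float, kScale, float>::selectWinner();
    return -1; // unsupported combination -> HIPTENSOR_STATUS_EXECUTION_FAILED above
}
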
diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 6d481577..aec12e32 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -72,7 +72,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); // Bilinear f64 registerSolutions( @@ -85,7 +112,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale bf16 registerSolutions( @@ -126,8 +166,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); // Scale f64 registerSolutions( enumerateContractionSolutions<2, @@ -139,6 +205,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp index ec9de45c..4c44de88 100644 --- a/library/src/contraction/contraction_solution_params.hpp +++ b/library/src/contraction/contraction_solution_params.hpp @@ -49,10 +49,11 @@ namespace hiptensor virtual int32_t dimsK() const = 0; // Map to hipDataType - virtual hipDataType typeA() const = 0; - 
virtual hipDataType typeB() const = 0; - virtual hipDataType typeC() const = 0; - virtual hipDataType typeD() const = 0; + virtual hipDataType typeA() const = 0; + virtual hipDataType typeB() const = 0; + virtual hipDataType typeC() const = 0; + virtual hipDataType typeD() const = 0; + virtual hiptensorComputeType_t typeCompute() const = 0; // Map to operators virtual hiptensorOperator_t opA() const = 0; diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index bff33960..b84f9c2b 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -42,6 +42,7 @@ namespace std return hiptensor::Hash{}(s.dimsM(), s.dimsN(), s.dimsK(), + s.typeCompute(), s.typeA(), s.typeB(), s.typeC(), @@ -102,6 +103,11 @@ namespace hiptensor return HipDataType_v; } + hiptensorComputeType_t typeCompute() const override + { + return convertToComputeType(HipDataType_v); + } + hiptensorOperator_t opA() const override { return ElementWiseOperatorType_v; diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp index 83674c81..9e2da1f9 100644 --- a/library/src/contraction/contraction_solution_registry.cpp +++ b/library/src/contraction/contraction_solution_registry.cpp @@ -53,19 +53,20 @@ namespace hiptensor } ContractionSolutionRegistry::Query - ContractionSolutionRegistry::Query::query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const + ContractionSolutionRegistry::Query::query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const { - auto solutionHash - = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + auto solutionHash = hashSolution( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end()) { @@ -81,10 +82,14 @@ namespace hiptensor return query(hashDimsMNK(dimsM, dimsN, dimsK)); } - ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const + ContractionSolutionRegistry::Query + ContractionSolutionRegistry::Query::query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const { - return query(hashTypesABCD(typeA, typeB, typeC, typeD)); + return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute)); } ContractionSolutionRegistry::Query @@ -159,18 +164,20 @@ namespace hiptensor /* static */ ContractionSolutionRegistry::Query::HashId - ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) + ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + 
hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) { - return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + return Hash{}( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); } /* static */ @@ -181,10 +188,14 @@ namespace hiptensor } /* static */ - ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) + ContractionSolutionRegistry::Query::HashId + ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) { - return Hash{}(typeA, typeB, typeC, typeD); + return Hash{}(typeA, typeB, typeC, typeD, typeCompute); } /* static */ @@ -220,12 +231,16 @@ namespace hiptensor params->typeD(), params->opA(), params->opB(), - params->opCDE()); + params->opCDE(), + params->typeCompute()); auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK()); - auto typesABCDHash - = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD()); + auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(), + params->typeB(), + params->typeC(), + params->typeD(), + params->typeCompute()); auto elementOpsHash = hashElementOps(params->opA(), params->opB()); @@ -236,7 +251,7 @@ namespace hiptensor mAllSolutions[solutionUid] = solution; mSolutionHash[solutionHash].push_back(solution); mSolutionHash[dimsMNKHash].push_back(solution); - mSolutionHash[typesABCDHash].push_back(solution); + mSolutionHash[typesComputeABCDHash].push_back(solution); mSolutionHash[elementOpsHash].push_back(solution); mSolutionHash[contactionOpsHash].push_back(solution); } diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp index d1b80ec5..44aaa97d 100644 --- a/library/src/contraction/contraction_solution_registry.hpp +++ b/library/src/contraction/contraction_solution_registry.hpp @@ -59,25 +59,27 @@ namespace hiptensor /// E.g. in this context, query further parameters. 
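
// A minimal sketch, not hipTensor code, of why typeCompute must join the key
// built by hashSolution/hashTypesComputeABCD above: two solutions that agree on
// all tensor types but differ in compute type must land in different buckets.
// hashCombineSketch and solutionKeySketch are illustrative names.

#include <cstddef>
#include <functional>

inline void hashCombineSketch(std::size_t& seed, std::size_t v)
{
    seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2); // boost-style mixing step
}

inline std::size_t solutionKeySketch(int typeA, int typeB, int typeC, int typeD, int typeCompute)
{
    std::size_t seed = 0;
    for(int field : {typeA, typeB, typeC, typeD, typeCompute})
        hashCombineSketch(seed, std::hash<int>{}(field));
    return seed; // same A/B/C/D types with f16 vs f32 compute now hash apart
}
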
// By full solution type - Query query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const; + Query query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const; // By dimensions Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const; // By data types - Query query(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD) const; + Query query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const; // By element-wise operations Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const; @@ -104,22 +106,24 @@ namespace hiptensor Query query(HashId queryHash) const; // Hashing helpers - static HashId hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE); + static HashId hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute); static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK); - static HashId hashTypesABCD(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD); + static HashId hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute); static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB); static HashId hashContractionOps(ContractionOpId_t opCDE); diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index b9b382c0..eacac5b1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -33,10 +33,22 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -49,15 +61,27 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp ) - add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) - target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) +add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) +target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index 7d777a83..3b3f6d47 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
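
Each source file registered in the CMake list above encodes its kernel configuration directly in its name: the tensor data types, the accumulation (compute) type, and the fast-changing-dimension layout. A small sketch of that grammar for the bilinear instances follows (scale instances carry one fewer tensor type and a three-letter layout); bilinearInstanceNameSketch is an illustrative helper, not part of the build.

#include <string>

// Illustrative only: reassembles names like
// device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance
// where "kknn" names the fast-changing dimension of A/B/D/E respectively.
inline std::string bilinearInstanceNameSketch(const std::string& dataType,    // "f16", "f32", "f64", ...
                                              const std::string& computeType, // "f16", "bf16", "f32"
                                              const std::string& layout)      // "kknn", "knnn", "mknn", "mnnn"
{
    return "device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_" + dataType + "_" + dataType
           + "_" + dataType + "_" + dataType + "_compute_" + computeType + "_" + layout
           + "_instance";
}
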
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index a9a97148..fd43f0ad 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index d83d8d16..21fb8127 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index bc49c82b..cc975c03 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index a9d963ab..ff670630 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index c139942e..be8bfe84 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 3c6ced30..4be69898 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index 33c66296..2f6d630b 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
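
// The kknn file just shown is the template for every new instance source in
// this patch: define one alias over a device_contraction_*_instance type list,
// then expose one add_* entry point that appends those kernels to the caller's
// vector. A reduced sketch of that pattern follows, under assumed names:
// DeviceOpSketch and the TunedKernel types stand in for the CK device-op types.

#include <memory>
#include <vector>

struct DeviceOpSketch // stand-in for the CK DeviceContractionMultipleD interface
{
    virtual ~DeviceOpSketch() = default;
};
struct TunedKernel1 : DeviceOpSketch {};
struct TunedKernel2 : DeviceOpSketch {};

// Stand-in for ck::tensor_operation::device::instance::add_device_operation_instances:
// append one heap-allocated entry per tuned kernel in the list.
template <typename... Kernels>
void addDeviceOperationInstancesSketch(std::vector<std::unique_ptr<DeviceOpSketch>>& instances)
{
    (instances.push_back(std::make_unique<Kernels>()), ...);
}

// Shape of each add_device_contraction_..._instance() entry point above.
void addComputeTypeInstancesSketch(std::vector<std::unique_ptr<DeviceOpSketch>>& instances)
{
    addDeviceOperationInstancesSketch<TunedKernel1, TunedKernel2>(instances);
}
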
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance
+                    = device_contraction_kk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
new file mode 100644
index 00000000..d82ea442
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance
+                    = device_contraction_kn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
new file mode 100644
index 00000000..772df2e3
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance
+                    = device_contraction_mk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp
new file mode 100644
index 00000000..8b1d0681
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance
+                    = device_contraction_mn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
new file mode 100644
index 00000000..ad5ce461
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance
+                    = device_contraction_f64_kk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
new file mode 100644
index 00000000..ae3ee856
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance
+                    = device_contraction_f64_kn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
new file mode 100644
index 00000000..b72005ad
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance
+                    = device_contraction_f64_mk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp
new file mode 100644
index 00000000..b94030e5
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance
+                    = device_contraction_f64_mn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
index 05400151..1da8301f 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index bba95b14..82c17500 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
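The kkn/knn/mkn/mnn suffixes in the file names above encode which mode of each tensor varies fastest, that is, which one has unit stride: the first letter describes A[m0, m1, k0, k1], the second B[n0, n1, k0, k1], and the last E (the bilinear variants insert a fourth letter for D). A minimal sketch of how a dispatcher might derive that suffix from innermost strides follows; the helper and its parameters are hypothetical, not hipTensor API:

#include <cstddef>
#include <string>

// Hypothetical helper: derive the three-letter scale-instance suffix from
// the innermost strides of A and B. A is "k"-fast when its last k mode is
// contiguous, otherwise the "m"-fast family applies; B is classified
// against k/n the same way.
inline std::string instanceSuffix(std::size_t aInnerStride, std::size_t bInnerStride)
{
    std::string s;
    s += (aInnerStride == 1) ? 'k' : 'm'; // A[m0, m1, k0, k1]
    s += (bInnerStride == 1) ? 'k' : 'n'; // B[n0, n1, k0, k1]
    s += 'n';                             // every instance in this commit keeps E n-fast
    return s;                             // "kkn", "knn", "mkn" or "mnn"
}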
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index fb5ecec0..1febb560 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index 1dd6613c..02b9d719 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index e98aee20..5917e466 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
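Every scale instance touched by this commit computes the rank-4 contraction written in the recurring comment: E[m0, m1, n0, n1] = alpha * sum over (k0, k1) of A[m0, m1, k0, k1] * B[n0, n1, k0, k1]; the bilinear variants additionally add beta * D[m0, m1, n0, n1]. As a plain-CPU reference for what the scale form evaluates, here is a sketch assuming packed row-major buffers (the function name and layout are illustrative only):

#include <cstddef>
#include <vector>

// Naive reference: each output element accumulates over both contracted
// modes (k0, k1) and is then scaled by alpha.
void referenceScaleContraction(const std::vector<float>& A, // [M0][M1][K0][K1]
                               const std::vector<float>& B, // [N0][N1][K0][K1]
                               std::vector<float>&       E, // [M0][M1][N0][N1]
                               std::size_t M0, std::size_t M1,
                               std::size_t N0, std::size_t N1,
                               std::size_t K0, std::size_t K1,
                               float alpha)
{
    for(std::size_t m0 = 0; m0 < M0; ++m0)
        for(std::size_t m1 = 0; m1 < M1; ++m1)
            for(std::size_t n0 = 0; n0 < N0; ++n0)
                for(std::size_t n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.0f;
                    for(std::size_t k0 = 0; k0 < K0; ++k0)
                        for(std::size_t k1 = 0; k1 < K1; ++k1)
                            acc += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1]
                                   * B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
                    E[((m0 * M1 + m1) * N0 + n0) * N1 + n1] = alpha * acc;
                }
}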
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index db8de1c0..216f470e 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index 397ef327..3401b605 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index 1f9221dc..fe2fa97d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
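Each add_device_contraction_* entry point in these files follows the same registration pattern: add_device_operation_instances walks a compile-time list of concrete kernel configurations, default-constructs one object per configuration, and appends it to the caller's vector behind the DeviceContractionMultipleD interface. A simplified stand-in for that pattern, a sketch rather than CK's actual implementation:

#include <memory>
#include <tuple>
#include <vector>

// Simplified registration helper: default-construct every instance type in
// a compile-time tuple and hand each one back as a base-class pointer.
template <typename BaseOp, typename... Instances>
void addDeviceOperationInstances(std::vector<std::unique_ptr<BaseOp>>& ops,
                                 std::tuple<Instances...> /*instanceList*/)
{
    (ops.push_back(std::make_unique<Instances>()), ...);
}

A dispatcher can then walk the filled vector and pick the first candidate whose IsSupportedArgument check accepts the problem, so the per-layout instance files themselves carry no selection logic.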
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp new file mode 100644 index 00000000..e4e4b7de --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp new file mode 100644 index 00000000..a641f6e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
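+//
+// Each add_*_instance() function in these files appends concrete kernels to
+// a caller-owned vector, so several layouts can be pooled before filtering.
+// A minimal usage sketch, where DeviceOpPtr stands in for the vector's
+// unique_ptr element type (not a real symbol):
+//
+//     std::vector<DeviceOpPtr> ops;
+//     add_device_contraction_scale_..._kkn_instance(ops);
+//     add_device_contraction_scale_..._mnn_instance(ops);
+//     // then keep the first op whose IsSupportedArgument(...) passes
+//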
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..04176d80 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
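+//
+// These instances keep A/B/D/E in F64 but accumulate through an F32 compute
+// type, trading some precision for speed. Through the hipTensor API the
+// combination is selected by pairing HIP_R_64F tensor descriptors with
+//
+//     constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
+//
+// which is what the simple_*_contraction_f64_f32 samples added in this
+// series do.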
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..06481fc7 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
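+//
+// On the kernel name: "xdl" marks kernels built on the XDLOPS/MFMA matrix
+// instructions of CDNA GPUs, and "c_shuffle" marks an epilogue that stages
+// the accumulator tile through LDS so global writes land coalesced. This
+// reading of composable_kernel's naming scheme is offered as background,
+// not as a normative definition.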
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..94922008 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
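+//
+// Scale contractions compute E = alpha * contract(A, B) with no C/D input
+// tensor, which is why registry queries for the scale path pass
+// hiptensor::NONE_TYPE in the D slot. A sketch in the query style used
+// elsewhere in this series:
+//
+//     solnQ.query(HIP_R_64F, HIP_R_64F,
+//                 hipDataType(hiptensor::NONE_TYPE), HIP_R_64F);
+//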
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..e70b854b --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
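+//
+// add_device_operation_instances is assumed to expand the brace-initialized
+// instance list and move each concrete op into the caller's vector. A
+// hypothetical stand-in for what such a helper does:
+//
+//     template <typename Vec, typename... Ops>
+//     void append_ops(Vec& v, Ops... ops) // illustrative, not the CK API
+//     {
+//         (v.push_back(std::make_unique<Ops>(ops)), ...);
+//     }
+//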
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index b96a204e..c7b7501b 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -242,17 +242,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. 
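     // Query composition note: each ::query(...) call narrows the candidate
     // set, and partial queries can be OR-ed together, as the F32-only
     // fallback removed above did:
     //
     //     solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F)
     //          || solnQ.query(HIP_R_32F, HIP_R_32F,
     //                         hipDataType(hiptensor::NONE_TYPE), HIP_R_32F);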
if(solnQ.solutionCount() == 0) @@ -461,15 +450,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +490,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +509,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +574,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast(alpha))); - } + auto alphaValue + = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); } if(beta == nullptr) @@ -602,15 +585,8 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast(beta))); - } + auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); } } else @@ -745,6 +721,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { auto time = (*cSolution)(StreamConfig{stream, true}); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -773,7 +753,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..38e9f186 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -132,6 +132,49 @@ namespace hiptensor } } + void 
writeVal(void const* addr, hiptensorComputeType_t id, double value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..19ccca6c 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -65,6 +65,8 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, double value); + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index 15972d60..de834d72 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -29,9 +29,17 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) # If building hipTensor samples as a standalone Cmake project else() @@ -44,6 +52,18 @@ else() add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + + 
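+    # Note: every sample is registered twice by design: through
+    # add_hiptensor_sample() for in-tree builds (the branch above), and as an
+    # add_executable() + target_link_libraries(... hiptensor::hiptensor) pair
+    # for standalone builds here.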
add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) @@ -53,4 +73,15 @@ else() add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp new file mode 100644 index 00000000..aaef4a1b --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -0,0 +1,351 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +template +int bilinearContractionSample() +{ + floatTypeCompute alpha = (floatTypeCompute)1.0f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
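+    // Two notes on this shared sample header:
+    //  * Element counts below are products of the mode extents; for A that
+    //    is extent['m'] * extent['n'] * extent['h'] * extent['k']
+    //    = 4 * 3 * 6 * 5 = 360 elements.
+    //  * The header is driven by thin per-precision .cpp files; judging by
+    //    the typedefs and constexpr values those files define, each is
+    //    expected to instantiate
+    //
+    //        bilinearContractionSample<ADataType, BDataType, CDataType,
+    //                                  floatTypeCompute, typeA, typeB,
+    //                                  typeC, typeCompute>();
+    //
+    //    (template parameter list inferred from the drivers, stated here as
+    //    an assumption).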
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + int initMethod = 0; // TODO read value from commandline + for(int64_t i = 0; i < elementsA; i++) + { + if(initMethod == 0) + { + A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsB; i++) + { + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsC; i++) + { + if(initMethod == 0) + { + C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + C[i] = (BDataType)(float(i) / 100); + } + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." 
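+    // hiptensorGetAlignmentRequirement (called below) reports the byte
+    // alignment that each device pointer satisfies for its descriptor; the
+    // values are then handed to hiptensorInitContractionDescriptor so kernel
+    // selection can rely on them. This reading of the call's purpose is
+    // inferred from its usage here, not from the API reference.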
<< std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." 
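+    // Argument mapping for the bilinear call below: alpha scales the A*B
+    // contraction, beta scales C, and C_d fills both the C and D slots so
+    // the update happens in place:
+    //
+    //     C_d = alpha * contract(A_d, B_d) + beta * C_d
+    //
+    // The trailing 0 selects the default HIP stream.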
<< std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp index 0a4a9314..f6714a2f 100644 --- a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeC = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeC = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp index d9d044c9..40708c77 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef _Float16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeC = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32.cpp index 5704a59d..ee046145 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef float CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp new file mode 100644 index 00000000..42f60ecb --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp new file mode 100644 index 00000000..d39a4fca --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64.cpp new file mode 100644 index 00000000..412ebbc5 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp new file mode 100644 index 00000000..673c4768 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp new file mode 100644 index 00000000..e9d482c3 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -0,0 +1,341 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +template +int scaleContractionSample() +{ + floatTypeCompute alpha = (floatTypeCompute)1.0f; + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + int initMethod = 0; // TODO read the value from command line + for(int64_t i = 0; i < elementsA; i++) + { + if(initMethod == 0) + { + A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsB; i++) + { + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." 
<< std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." 
<< std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp index e05916bf..7b0f8b6c 100644 --- a/samples/01_contraction/simple_scale_contraction_bf16.cpp +++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp @@ -23,16 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -44,291 +35,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeD = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeD = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp index 1e62be85..d69193f0 100644 --- a/samples/01_contraction/simple_scale_contraction_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -23,312 +23,35 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { /*************************************** * Check device support * **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + typedef _Float16 ADataType; typedef _Float16 BDataType; typedef _Float16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeD = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32.cpp index c76ec370..e53cc468 100644 --- a/samples/01_contraction/simple_scale_contraction_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32.cpp @@ -23,16 +23,8 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -50,291 +42,17 @@ int main(int argc, char* argv[]) typedef float DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeD = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp new file mode 100644 index 00000000..c11b8ded --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp new file mode 100644 index 00000000..377ee707 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64.cpp new file mode 100644 index 00000000..5eb94c15 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp new file mode 100644 index 00000000..fdec48ab --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index a08065a0..08ddf0b2 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -3,8 +3,11 @@ Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index b28e9a88..08ddf0b2 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -4,7 +4,10 @@ Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, 
NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 9446157f..ce67278f 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -125,6 +125,9 @@ namespace hiptensor || (CDataType == NONE_TYPE)); EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE( + (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) + || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)); mRunFlag &= checkDevice(DDataType); @@ -488,7 +491,11 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - auto computeType = convertToComputeType(testType[4]); + auto computeType = convertToComputeType(testType[4]); + double alphaBuf = 0.; + double betaBuf = 0.; + writeVal(&alphaBuf, computeType, alpha); + writeVal(&betaBuf, computeType, beta); CHECK_HIPTENSOR_ERROR( hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); @@ -497,20 +504,21 @@ namespace hiptensor CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan, - (void*)&alpha, + (void*)&alphaBuf, resource->deviceA().get(), resource->deviceB().get(), - (void*)&beta, + (void*)&betaBuf, resource->deviceC().get(), resource->deviceD().get(), workspace, worksize, 0 /* stream */)); - CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha, + CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan, + (void*)&alphaBuf, resource->hostA().get(), resource->hostB().get(), - (void*)&beta, + (void*)&betaBuf, resource->hostC().get(), resource->hostD().get(), a_ms_ks.mLengths, From ab8d557e0e68d29c5d3b17020c5c43ef898ede8f Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 30 Nov 2023 18:46:46 +0000 Subject: [PATCH 10/42] Add placeholder for solution unique_id Solution unique_ids for Actor Critic are not ready yet, so we put placeholders in the new Actor Critic path to let the unit tests pass.
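To make the placeholder pattern concrete for reviewers, here is a minimal, self-contained C++ sketch of what the selection code in this patch reduces to; the type names (Solution, CandidateMap) and the registered entry are illustrative assumptions rather than hiptensor's actual types, and only the hard-coded unique_id value is taken from the patch itself:

    #include <cstddef>
    #include <iostream>
    #include <unordered_map>

    // Stand-in for a generated contraction kernel; illustrative only.
    struct Solution
    {
        char const* name;
    };

    // Registry of candidate kernels keyed by a stable 64-bit unique_id.
    using CandidateMap = std::unordered_map<std::size_t, Solution>;

    // Placeholder selection: look up a hard-coded id until the Actor Critic
    // model can predict the best kernel for a given problem shape.
    Solution const* selectSolution(CandidateMap const& candidates)
    {
        std::size_t unique_id = 7255639152084218514ull;
        if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
        {
            return &candidate->second;
        }
        return nullptr; // No kernel registered under this id.
    }

    int main()
    {
        CandidateMap candidates{{7255639152084218514ull, Solution{"kknn_f32_instance"}}};
        if(auto* solution = selectSolution(candidates))
        {
            std::cout << "selected: " << solution->name << std::endl;
        }
        return 0;
    }

Once Actor Critic selection is ready, the hard-coded assignment is the single line that a prediction replaces.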
--- .../src/contraction/contraction_selection.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 888ef4c1..68c748b0 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -72,8 +72,8 @@ namespace hiptensor * hipDataTypeSize(typeE); void * A_d, *B_d, *D_d, *E_d, *wspace; - double alpha = 0.0d; - double beta = 0.0d; + double alpha = 0.0; + double beta = 0.0; writeVal(&alpha, computeType, 1.02); writeVal(&beta, computeType, 1.03); @@ -188,7 +188,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -237,7 +237,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -286,7 +286,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -335,7 +335,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -379,6 +379,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 1078559130597702989ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -421,6 +422,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 6506383527825239632ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -463,6 +465,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 14486135440731032454ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -510,6 +513,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11931735240548010466ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1276,6 +1280,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11912251726020349830ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1317,6 +1322,7 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; + unique_id = 15375432626310194825ull; // TODO select unique_id if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) From df27e326d15a65118a657b04c63eef37ecde946e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:12:44 +0000 Subject: [PATCH 11/42] Update contraction device instances Update contraction device instances since CK has updated them. 
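For context on the churn in this patch: each CK "device instance" is one fully specialized kernel template, and the touched files collect those specializations into a tuple that hipTensor's selection layer enumerates. A trimmed-down sketch of that shape, assuming a three-parameter stand-in (the real DeviceContractionMultipleD_Xdl_CShuffle template takes dozens of parameters, which is why a CK signature change forces regenerating every instance file here):

    #include <iostream>
    #include <tuple>

    // Illustrative stand-in for a CK device op: one type per tile configuration.
    template <int BlockSize, int MPerBlock, int NPerBlock>
    struct DeviceContractionInstance
    {
        static constexpr int block_size  = BlockSize;
        static constexpr int m_per_block = MPerBlock;
        static constexpr int n_per_block = NPerBlock;
    };

    // The instance list; the tile sizes mirror the 256/128 configurations
    // visible in the real instance files in this patch.
    using device_instances = std::tuple<DeviceContractionInstance<256, 256, 128>,
                                        DeviceContractionInstance<256, 128, 256>,
                                        DeviceContractionInstance<128, 128, 128>>;

    int main()
    {
        std::cout << std::tuple_size<device_instances>::value << " candidate instances"
                  << std::endl;
        return 0;
    }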
--- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 62 ++++++----------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 65 ++++++------------ ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 59 ++++++---------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 65 +++++++----------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 68 +++++++------------ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 27 +++++++- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 58 ++++++---------- 20 files changed, 460 insertions(+), 642 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 88345e74..24d2d570 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp 
b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 38702afd..f559dc06 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 
128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 735a5e34..a522052d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index 04176d80..dac46620 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
index 06481fc7..0830b49f 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
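Every instance file touched in this series follows the same pattern: the hand-written std::tuple<DeviceContractionMultipleD_Xdl_CShuffle<...>, ...> tuning tables are collapsed into shared aliases (device_contraction_kk_instance, device_contraction_f64_mk_instance, and so on), presumably provided by the newly included ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp, while each add_device_contraction_* signature gains a trailing compute data type (F32 or F64) on DeviceContractionMultipleD. The following is a minimal, self-contained sketch of that alias pattern; the type names and parameter order are illustrative stand-ins, not CK's actual templates.

#include <tuple>

// Stand-ins for CK's element-wise functors (hypothetical, for illustration only).
struct Bilinear { };
struct Scale { };

// Stand-in for a fully specialized device op; the real
// DeviceContractionMultipleD_Xdl_CShuffle takes dozens of tuning parameters.
template <int NumDimM, int NumDimN, int NumDimK,
          typename ABDataType, typename DsTuple, typename EDataType,
          typename CDEOp, typename ComputeType /* the new trailing parameter */>
struct DeviceOpInstance { };

// One parameterized alias replaces the per-file tuples: every layout variant
// (kkn, knn, mkn, mnn) draws its rows from a common template instead of
// spelling out each instance row by hand in its own .cpp file.
template <typename DsTuple, typename CDEOp, typename ComputeType>
using device_contraction_f64_instance = std::tuple<
    DeviceOpInstance<2, 2, 2, double, DsTuple, double, CDEOp, ComputeType>
    /* , ...further block/tile tuning variants... */>;

// Bilinear consumes a one-element Ds tuple (the D tensor); Scale consumes none.
using bilinear_f64 = device_contraction_f64_instance<std::tuple<double>, Bilinear, double>;
using scale_f64    = device_contraction_f64_instance<std::tuple<>, Scale, double>;

int main()
{
    bilinear_f64 b;
    scale_f64    s;
    (void)b;
    (void)s;
    return 0;
}

The payoff, visible in each hunk above, is that a layout file shrinks to the fast-changing-dimension comment plus a single alias line, with the tuning rows maintained once in the shared header.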
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
index 94922008..9a716ba3 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
index e70b854b..e02ac144 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index f8904a8f..6f168ee2 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter
-// of each device op
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, 
F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, From f85df837f3ae885178b197f8c2435c14e9847a2c Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:29:00 +0000 Subject: [PATCH 12/42] Print C in sample output 1. Initialize the data with 0.01, 0.02, ... by default 2.
Print C --- .../simple_bilinear_contraction.hpp | 22 ++++++++++++++----- .../simple_scale_contraction.hpp | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index aaef4a1b..27001232 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -154,7 +154,7 @@ int bilinearContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read value from commandline + int initMethod = 1; // TODO read value from commandline for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) @@ -287,11 +287,6 @@ int bilinearContractionSample() bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -314,6 +309,15 @@ int bilinearContractionSample() hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -327,6 +331,12 @@ int bilinearContractionSample() hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp index e9d482c3..78b026b6 100644 --- a/samples/01_contraction/simple_scale_contraction.hpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -151,7 +151,7 @@ int scaleContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read the value from command line + int initMethod = 1; // TODO read the value from command line for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) From 5c45a8c80dd0e90171a791bd945c0e41b84ef22d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 18:15:54 +0000 Subject: [PATCH 13/42] Set CK contraction instance to run only once When the logger level is set to HIPTENSOR_LOG_LEVEL_PERF_TRACE, we make CK instances measure the running time. The problem is that CK internally will run the contraction 10 times by default. This leads to an issue: 1.
It returns a wrong result for C = alpha * A x B + beta * C. Setting StreamConfig.nrepeat_ = 1, the contraction will be run only once --- library/src/contraction/hiptensor_contraction.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index c7b7501b..8148eeaa 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -720,7 +720,13 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); if(time < 0) { return HIPTENSOR_STATUS_CK_ERROR; From f631818937db143e42d444d4a0c2ce5646ad525e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 23:57:33 +0000 Subject: [PATCH 14/42] Fixed a bug in CPU reference 1. ck::bhalf_t cannot be cast to float or double by static_cast. Use ck::type_convert() to fix it. 2. epsilon() is not a good value to measure the relative difference of data. It is too small for double (eps < 1e-12). --- .../contraction_cpu_reference_impl.hpp | 17 +++++++---------- .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 ++++----- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index ac4fc20d..a9a9d176 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -53,7 +53,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -152,7 +151,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - auto accum = static_cast(0); + float accum = 0.0f; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -174,8 +173,7 @@ namespace hiptensor arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum - += static_cast(valA) * static_cast(valB); + accum += ck::type_convert(valA) * ck::type_convert(valB); } } @@ -184,15 +182,17 @@ namespace hiptensor if constexpr(std::is_same_v) { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } else // bilinear { // NumDTensor will be 1 due to SFINAE of this class auto indexD = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE( - ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); } }; @@ -323,7 +323,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -335,7 +334,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - AccumDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, @@ -375,7 +373,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - EDataType,
AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..05daf544 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits::signaling_NaN(); } - else if(max_relative_error > (eps * tolerance)) + else if(max_relative_error > tolerance) { retval = false; } @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,13 +276,12 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; - auto eps = toDouble(std::numeric_limits::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits::signaling_NaN(); } - else if(maxRelativeError > tolerance) From e5cefe79a7e4630b4e1f07edd425a6cba6fda519 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 01:43:48 +0000 Subject: [PATCH 15/42] Add comments --- library/src/contraction/contraction_meta_traits.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 6a7cb35f..e66ac432 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -67,6 +67,14 @@ namespace hiptensor constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; + /* + * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) for the cuda bhalf_t type. + * What we want here is that we can use ck::bhalf_t with ck instances and use hip_bfloat16 + * with hiptensor classes. + * + * When creating a solution, ck::bhalf_t is passed in to create the ck instance. + * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
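+ * + * For example (illustrative only, not exhaustive): a bf16 solution is created with ADataType = ck::bhalf_t for the CK instance, while MetaTraits reports ADataT = hip_bfloat16, so hiptensor-side lookups and the public API both see hip_bfloat16.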
+ */ using ADataT = std::conditional_t, hip_bfloat16, ADataType>; using BDataT From 4345a1c5b4b32fa427a8880a944895b3947ee6dd Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 17:14:53 +0000 Subject: [PATCH 16/42] Rename contraction sample files The naming pattern of contraction sample files is - bilinear: simple_bilinear_contraction_<typeA>_<typeB>_<typeC>_<typeD>_compute_<typeCompute>.cpp - scale : simple_scale_contraction_<typeA>_<typeB>_<typeD>_compute_<typeCompute>.cpp --- samples/01_contraction/CMakeLists.txt | 85 ++++++++++--------- ...tion_bf16_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...ntraction_f16_f16_f16_f16_compute_f16.cpp} | 0 ...traction_f32_f32_f32_f32_compute_bf16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f64.cpp} | 0 ...ntraction_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...e_contraction_f16_f16_f16_compute_f16.cpp} | 0 ..._contraction_f32_f32_f32_compute_bf16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f64.cpp} | 0 15 files changed, 43 insertions(+), 42 deletions(-) rename samples/01_contraction/{simple_bilinear_contraction_bf16.cpp => simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f16.cpp => simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_bf16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_f16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64_f32.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_bf16.cpp => simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f16.cpp => simple_scale_contraction_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_bf16.cpp => simple_scale_contraction_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_f16.cpp => simple_scale_contraction_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32.cpp => simple_scale_contraction_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64_f32.cpp => simple_scale_contraction_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64.cpp => simple_scale_contraction_f64_f64_f64_compute_f64.cpp} (100%) diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index de834d72..00393f1d 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,62 +26,63 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) -
add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() - add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) - target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + 
target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE 
hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to 
samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp From 43f33ee5c6b40d0b4278cd1c221399eb99b16a7d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:02:53 +0000 Subject: [PATCH 17/42] Improve CPU reference accuracy The relative difference between the contraction result and the CPU reference is less than 0.1% after the improvement.
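For illustration only (a sketch, not part of this patch): the idea is to round each partial product through the compute type before accumulating, mirroring what the device instances do. Assuming float data with a hypothetical _Float16 compute type:

```
#include <cstddef>

// Sketch: accumulate in float, but round each product through the
// narrower compute type first, as the accuracy fix below does with
// ck::type_convert in the CPU reference.
float referenceDot(const float* a, const float* b, std::size_t n)
{
    float accum = 0.0f;
    for(std::size_t i = 0; i < n; ++i)
    {
        _Float16 prod = static_cast<_Float16>(a[i]) * static_cast<_Float16>(b[i]);
        accum += static_cast<float>(prod);
    }
    return accum;
}
```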
--- library/src/contraction/contraction_cpu_reference_impl.hpp | 3 ++- test/utils.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index a9a9d176..d21df2d3 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -173,7 +173,8 @@ namespace hiptensor arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum += ck::type_convert(valA) * ck::type_convert(valB); + accum += ck::type_convert(ck::type_convert( + ck::type_convert(valA) * ck::type_convert(valB))); } } diff --git a/test/utils.hpp b/test/utils.hpp index 05daf544..f39f0fb5 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { bool retval = true; double max_relative_error = 0.0; @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); From fec9065460d2205f9b9478ccd5f69fa51d2a839e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:19:36 +0000 Subject: [PATCH 18/42] Add comments to explain how to pass the alpha value --- library/src/contraction/contraction_selection.cpp | 11 ++++++++++- test/01_contraction/contraction_test.cpp | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 68c748b0..9b0cdf9f 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -71,7 +71,16 @@ namespace hiptensor auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides) * hipDataTypeSize(typeE); - void * A_d, *B_d, *D_d, *E_d, *wspace; + void *A_d, *B_d, *D_d, *E_d, *wspace; + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a ComputeData value + */ double alpha = 0.0; double beta = 0.0; writeVal(&alpha, computeType, 1.02); diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index ce67278f..76cc3033 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -491,9 +491,18 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - auto computeType = convertToComputeType(testType[4]); - double alphaBuf = 0.; - double betaBuf = 0.; + auto computeType = convertToComputeType(testType[4]); + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha.
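+ * For example: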
+ * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a ComputeData value + */ + double alphaBuf = 0.; + double betaBuf = 0.; writeVal(&alphaBuf, computeType, alpha); writeVal(&betaBuf, computeType, beta); From b21fe0b18881fb6ed5643be7bd2e242f9a4b45a2 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 7 Dec 2023 02:16:22 +0000 Subject: [PATCH 19/42] Update CPU reference 1. Revert the default threshold of the relative difference to (100 * std::numeric_limits::epsilon()) 2. Update the CPU reference so that the difference between the CPU reference and the output of the contraction instance is less than (100 * std::numeric_limits::epsilon()). --- .../contraction_cpu_reference_impl.hpp | 29 ++++++++++++++----- .../contraction_cpu_reference_instances.cpp | 14 +++++++++ .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 +++--- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index d21df2d3..2e3d0cbe 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -45,19 +45,25 @@ namespace hiptensor { // hardcoded for NumDimM == NumDimN == NumDimK == 2 + // + // ck::bhalf_t is ushort, which cannot perform bhalf_t * bhalf_t. + // CK does not use ck::bhalf_t as AccDataType. But we still + // add this guard here template < ck::index_t NumDimM, ck::index_t NumDimN, ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputeDataType = ADataType, - ck::enable_if_t, bool> = false> struct ReferenceContraction_M2_N2_K2 @@ -152,7 +157,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - AccDataType accum = 0; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -165,16 +171,19 @@ namespace hiptensor auto indexB = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - ADataType valA; - BDataType valB; + AccDataType valA; + AccDataType valB; // Element-wise ops - arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]); - arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); + arg.mOpA( + valA, + ck::type_convert(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, + ck::type_convert(((BDataType*)arg.mB)[indexB])); // Mult / accum - accum += ck::type_convert(ck::type_convert( + ck::type_convert(valA) * ck::type_convert(valB))); + accum += valA * valB; } } @@ -322,6 +331,7 @@ namespace hiptensor ck::index_t NumDimsK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -333,6 +343,7 @@ namespace hiptensor NumDimsK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, @@ -359,6 +370,7 @@ namespace hiptensor ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -372,6 +384,7 @@ namespace hiptensor NumDimK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 173a49e9..31fb0191 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -39,6 +39,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -53,6 +54,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -67,6 +69,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, @@ -80,6 +83,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, @@ -93,6 +97,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, @@ -107,6 +112,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple, double, ck::tensor_operation::element_wise::PassThrough, @@ -120,6 +126,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple, double, ck::tensor_operation::element_wise::PassThrough, @@ -134,6 +141,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple<>, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -148,6 +156,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple<>, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -162,6 +171,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -175,6 +185,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -188,6 +199,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -202,6 +214,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, @@ -215,6 +228,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... 
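The utils.hpp hunk below restores the epsilon-scaled comparison. As a minimal standalone sketch of that check (hypothetical helper, not part of this patch; names mirror test/utils.hpp, and DDataType must have a std::numeric_limits specialization):

```
#include <cmath>
#include <limits>

// Sketch: an output value passes when its maximum relative error stays
// below 100 * machine epsilon of the output data type.
template <typename DDataType>
bool withinTolerance(double maxRelativeError, double tolerance = 100.0)
{
    double eps = static_cast<double>(std::numeric_limits<DDataType>::epsilon());
    return !std::isnan(maxRelativeError) && (maxRelativeError <= eps * tolerance);
}
```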
diff --git a/test/utils.hpp b/test/utils.hpp index f39f0fb5..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits::signaling_NaN(); } - else if(max_relative_error > tolerance) + else if(max_relative_error > (eps * tolerance)) { retval = false; } @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,12 +276,13 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; + auto eps = toDouble(std::numeric_limits::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits::signaling_NaN(); } - else if(maxRelativeError > tolerance) + else if(maxRelativeError > (eps * tolerance)) { retval = false; } From 76de7d0b89f1961b0c43c6cab8781565d6f9ad08 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 11 Dec 2023 20:09:31 +0000 Subject: [PATCH 20/42] Remove xfloat32 which is not used in hiptensor --- .../hiptensor/internal/native_types.hpp | 5 - .../hiptensor/internal/type_traits.hpp | 82 +---- .../include/hiptensor/internal/xfloat32.hpp | 334 ------------------ 3 files changed, 5 insertions(+), 416 deletions(-) delete mode 100644 library/include/hiptensor/internal/xfloat32.hpp diff --git a/library/include/hiptensor/internal/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp index 6c9dbee8..69ce706f 100644 --- a/library/include/hiptensor/internal/native_types.hpp +++ b/library/include/hiptensor/internal/native_types.hpp @@ -33,8 +33,6 @@ #include #include -#include "xfloat32.hpp" - namespace hiptensor { @@ -84,9 +82,6 @@ namespace hiptensor #if !HIPTENSOR_NO_HALF using hfloat16_t = __half; #endif // !HIPTENSOR_NO_HALF - - using xfloat32_t = hiptensor_xfloat32; - // clang-format off diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index 3867839d..48566051 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -26,9 +26,11 @@ #ifndef HIPTENSOR_TYPE_TRAITS_HPP #define HIPTENSOR_TYPE_TRAITS_HPP -#include "native_types.hpp" #include +#include "config.hpp" +#include "native_types.hpp" + namespace hiptensor { namespace detail @@ -69,9 +71,8 @@ namespace hiptensor { union { - uint32_t i32; - float32_t f32; - xfloat32_t xf32; + uint32_t i32; + float32_t f32; }; constexpr Fp32Bits(uint32_t initVal) : i32(initVal) @@ -81,10 +82,6 @@ namespace hiptensor : f32(initVal) { } - constexpr Fp32Bits(xfloat32_t initVal) - : xf32(initVal) - { - } }; } // namespace detail @@ -273,68 +270,6 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } - - /////////////////////////////////////////////////////////// - /////////// std::numeric_limits ////////////// - /////////////////////////////////////////////////////////// - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::epsilon() noexcept - { - 
hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::infinity() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::lowest() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::max() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::min() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::quiet_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::signaling_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); - return eps.xf32; - } - // @endcond - } // namespace std namespace hiptensor @@ -378,13 +313,6 @@ namespace hiptensor // b16 mantissa is 7 bits return ((int32_t)1 << 8); } - - template ::value, int> = 0> - constexpr auto maxExactInteger() -> int32_t - { - // xf32 mantissa is 7 bits - return ((int32_t)1 << 8); - } } // namespace hiptensor #endif // HIPTENSOR_TYPE_TRAITS_HPP diff --git a/library/include/hiptensor/internal/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp deleted file mode 100644 index 6e9168cf..00000000 --- a/library/include/hiptensor/internal/xfloat32.hpp +++ /dev/null @@ -1,334 +0,0 @@ -/* ************************************************************************ - * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- - * ies of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- - * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- - * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ************************************************************************ */ - -/*!\file - * \brief xfloat32.h provides struct for hiptensor_xfloat32 typedef - */ - -#ifndef HIPTENSOR_XFLOAT32_HPP -#define HIPTENSOR_XFLOAT32_HPP - -#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only -// include a minimal definition of hiptensor_xfloat32 - -#include -typedef struct -{ - float data; -} hiptensor_xfloat32; - -#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#include -#include -#include -#include -#include -#include - -#include "config.hpp" - -struct hiptensor_xfloat32 -{ - float data; - - enum round_t - { - round_up - }; - - HIPTENSOR_HOST_DEVICE hiptensor_xfloat32() = default; - - // round upper 19 bits of IEEE float to convert to xfloat32 - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f, round_t) - : data(float_to_xfloat32(f)) - { - } - - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f) - : data(truncate_float_to_xfloat32(f)) - { - } - - // zero extend lower 13 bits of xfloat32 to convert to IEEE float - HIPTENSOR_HOST_DEVICE operator float() const - { - return data; - } - - explicit HIPTENSOR_HOST_DEVICE operator bool() const - { - union - { - float fp32; - uint32_t int32; - } u = {data}; - return u.int32 & 0x7fffe000; - } - - explicit HIPTENSOR_HOST_DEVICE operator uint32_t() const - { - return uint32_t(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator long() const - { - return long(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator double() const - { - return double(float(*this)); - } - -private: - static HIPTENSOR_HOST_DEVICE float float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - if(~u.int32 & 0x7f800000) - { - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the xfloat32 mantissa up by adding 0xFFF, plus - // 1 if the least significant bit of the xfloat32 mantissa is 1 (odd). - // This causes the xfloat32's mantissa to be incremented by 1 if the 13 - // least significant bits of the float mantissa are greater than 0x1000, - // or if they are equal to 0x1000 and the least significant bit of the - // xfloat32 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 13 bits are exactly 0x1000. If the xfloat32 mantissa already - // has the value 0x3ff, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded xfloat32 value. When the xfloat32 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x3FF, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the xfloat32 value has an exponent of 0xFE and a mantissa of 0x3FF, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - - u.int32 += 0xfff + ((u.int32 >> 13) & 1); // Round to nearest, round to even - } - else if(u.int32 & 0x1fff) - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. 
If any of the - // lower 13 bits of the mantissa are 1, we set the least significant bit - // of the xfloat32 mantissa, in order to preserve signaling NaN in case - // the xfloat32's mantissa bits are all 0. - u.int32 |= 0x2000; // Preserve signaling NaN - } - - u.int32 &= 0xffffe000; - return u.fp32; - } - - // Truncate instead of rounding - static HIPTENSOR_HOST_DEVICE float truncate_float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - - u.int32 = u.int32 & 0xffffe000; - return u.fp32; - } -}; - -typedef struct -{ - float data; -} hiptensor_xfloat32_public; - -static_assert(std::is_standard_layout{}, - "hiptensor_xfloat32 is not a standard layout type, and thus is " - "incompatible with C."); - -static_assert(std::is_trivial{}, - "hiptensor_xfloat32 is not a trivial type, and thus is " - "incompatible with C."); - -static_assert(sizeof(hiptensor_xfloat32) == sizeof(hiptensor_xfloat32_public) - && offsetof(hiptensor_xfloat32, data) - == offsetof(hiptensor_xfloat32_public, data), - "internal hiptensor_xfloat32 does not match public hiptensor_xfloat32"); - -inline std::ostream& operator<<(std::ostream& os, const hiptensor_xfloat32& xf32) -{ - return os << float(xf32); -} - -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a) -{ - return a; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a) -{ - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - u.int32 ^= 0x80000000; - return hiptensor_xfloat32(u.fp32); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) + float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) - float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator*(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) * float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator/(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) / float(b)); -} -inline HIPTENSOR_HOST_DEVICE bool operator<(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) < float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator==(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) == float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return b < a; -} -inline HIPTENSOR_HOST_DEVICE bool operator<=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a > b); -} -inline HIPTENSOR_HOST_DEVICE bool operator!=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a == b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a < b); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator+=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a + b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator-=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a - b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator*=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a * b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator/=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a / b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator++(hiptensor_xfloat32& a) -{ - return a += hiptensor_xfloat32(1.0f); 
-} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator--(hiptensor_xfloat32& a) -{ - return a -= hiptensor_xfloat32(1.0f); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator++(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - ++a; - return orig; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator--(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - --a; - return orig; -} - -namespace std -{ - constexpr HIPTENSOR_HOST_DEVICE bool isinf(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && !(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool isnan(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && +(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool iszero(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return (u.fp32 == 0.0f); - } - - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 sin(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(sinf(float(a))); - } - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 cos(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(cosf(float(a))); - } - - HIPTENSOR_HOST_DEVICE constexpr hiptensor_xfloat32 real(const hiptensor_xfloat32& a) - { - return a; - } -} - -#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#endif // HIPTENSOR_XFLOAT32_HPP From 28fe756bb3858eb817540048edbe002c1c43c8f6 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Tue, 12 Dec 2023 12:44:25 +0100 Subject: [PATCH 21/42] fix build warnings removed double std namespaces fix underlines --- .gitignore | 1 + docs/Contributors_Guide.rst | 2 +- docs/Programmers_Guide.rst | 24 +++---- .../internal/hiptensor-version.hpp.in | 9 +++ .../hiptensor/internal/type_traits.hpp | 63 +++++++++++++++++++ .../contraction/contraction_solution_impl.hpp | 6 +- .../contraction_solution_params_impl.hpp | 4 +- 7 files changed, 91 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index ad44a303..674c60bc 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,4 @@ _templates/ _toc.yml docBin/ _doxygen/ +.venv diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst index d75a884b..212248be 100644 --- a/docs/Contributors_Guide.rst +++ b/docs/Contributors_Guide.rst @@ -30,7 +30,7 @@ The hipTensor repository follows a workflow which dictates a /master branch wher the compute bound limit or memory bound limit. Style Guide -========== +=========== This project follows the `CPP Core guidelines `__, diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst index 1eaf9adf..047c1f5a 100644 --- a/docs/Programmers_Guide.rst +++ b/docs/Programmers_Guide.rst @@ -17,13 +17,13 @@ The `library` directory ^^^^^^^^^^^^^^^^^^^^^^^ `library/include/hiptensor/` -''''''''''''''''''''''''''' +'''''''''''''''''''''''''''' Contains C++ include files for the hipTensor API. These files also contain Doxygen comments that document the API. `library/include/hiptensor/internal` -'''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''' Internal include files for: @@ -31,30 +31,30 @@ Internal include files for: - Generate Tensor Utility `library/src/` -'''''''''''' +'''''''''''''' Contains logger, device and performance functions. 
`library/src/contraction/` -'''''''''''''''''''''''' +'''''''''''''''''''''''''' Contains hipTensor core composable kernel header functions and contraction initialization functions. `library/src/contraction/device` -'''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''' Contains hipTensor Bilinear and Scale instance functions The `samples` directory ^^^^^^^^^^^^^^^^^^^^^^^ `01_contraction/simple_bilinear_contraction_f32.cpp` -'''''''''''''''''''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''''''''''''''''''' sample code for calling bilinear contraction for :code:`fp32` input, output and compute types `01_contraction/simple_scale_contraction_f32.cpp` -''''''''''''''''''''''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''''''''''' sample code for calling scale contraction for :code:`fp32` input, output and compute types @@ -62,27 +62,27 @@ The `test` directory ^^^^^^^^^^^^^^^^^^^^^^^ `00_unit/logger` -'''''''''''''' +'''''''''''''''' Test code for testing logger API Functions of hipTensor `01_contraction/bilinear_contraction_f32` -''''''''''''''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F32 types. `01_contraction/bilinear_contraction_f64` -''''''''''''''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F64 types. `01_contraction/scale_contraction_f32` -'''''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F32 types. `01_contraction/scale_contraction_f64` -'''''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F64 types. diff --git a/library/include/hiptensor/internal/hiptensor-version.hpp.in b/library/include/hiptensor/internal/hiptensor-version.hpp.in index e1942a2b..89247375 100644 --- a/library/include/hiptensor/internal/hiptensor-version.hpp.in +++ b/library/include/hiptensor/internal/hiptensor-version.hpp.in @@ -38,6 +38,15 @@ #define HIPTENSOR_PATCH_VERSION @hiptensor_VERSION_PATCH@ // clang-format on +/** + * \brief Returns the version number of hipTensor + * + * \details Return the version with three least significant digits for patch version, + * the next three digits for minor version, and the most significant digits for major version. + * + * \returns The version number. 
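+ *
+ * For example, version 1.2.3 is encoded as 1 * 1e6 + 2 * 1e3 + 3 = 1002003.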
+ */ + inline size_t hiptensorGetVersion() { return HIPTENSOR_MAJOR_VERSION * 1e6 + HIPTENSOR_MINOR_VERSION * 1e3 + HIPTENSOR_PATCH_VERSION; diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index 48566051..7735a5c4 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -93,6 +93,7 @@ namespace std /////////// std::numeric_limits ////////////// /////////////////////////////////////////////////////////// +#ifndef DOXYGEN_SHOULD_SKIP_THIS template <> HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t numeric_limits::epsilon() noexcept @@ -270,6 +271,68 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } + + /////////////////////////////////////////////////////////// + /////////// std::numeric_limits ////////////// + /////////////////////////////////////////////////////////// + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::epsilon() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::infinity() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::lowest() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::max() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::min() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::quiet_NaN() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::signaling_NaN() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); + return eps.xf32; + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + // @endcond } // namespace std namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 3b672fbb..263937c3 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -35,11 +35,11 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolution const& s) const noexcept + size_t operator()(hiptensor::ContractionSolution const& s) const noexcept { - return std::hash{}(*s.params()); + return hash{}(*s.params()); } }; } diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index b84f9c2b..3abcaede 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -35,9 +35,9 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept + size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept 
{ return hiptensor::Hash{}(s.dimsM(), s.dimsN(), From c18335a1e81d0829f873d89c1a9b03544aed3c22 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Tue, 12 Dec 2023 16:44:55 +0100 Subject: [PATCH 22/42] update doxyfile --- docs/.doxygen/Doxyfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/.doxygen/Doxyfile b/docs/.doxygen/Doxyfile index 59a973b7..136d3b8c 100644 --- a/docs/.doxygen/Doxyfile +++ b/docs/.doxygen/Doxyfile @@ -2074,7 +2074,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = __device__ +PREDEFINED = __device__ \ + DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The From 7ac3fb965aed3862488b0498d6a0f8a0c33e2eb4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 08:58:43 -0700 Subject: [PATCH 23/42] Bump cryptography from 41.0.4 to 41.0.6 in /docs/.sphinx (#162) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.4 to 41.0.6. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.4...41.0.6) --- updated-dependencies: - dependency-name: cryptography dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index b339d3e7..ce13fde5 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==41.0.4 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub From b03e4f3bdfd595f5341d9da8226d9ac0b8e64b1c Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Wed, 13 Dec 2023 09:57:07 +0100 Subject: [PATCH 24/42] remove type traits, that sneaked back during rebase --- .../hiptensor/internal/type_traits.hpp | 60 ------------------- 1 file changed, 60 deletions(-) diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index 7735a5c4..d1329498 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -271,66 +271,6 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } - - /////////////////////////////////////////////////////////// - /////////// std::numeric_limits ////////////// - /////////////////////////////////////////////////////////// - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::epsilon() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::infinity() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::lowest() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - 
numeric_limits::max() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::min() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::quiet_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::signaling_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); - return eps.xf32; - } #endif // DOXYGEN_SHOULD_SKIP_THIS // @endcond } // namespace std From ded69b930ac83a2126061f71d7dcda6ae2c6d6a7 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 14 Dec 2023 09:30:28 +0100 Subject: [PATCH 25/42] remove unnecessary endcond --- library/include/hiptensor/internal/type_traits.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index d1329498..81bafacd 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -272,7 +272,6 @@ namespace std return eps.b16; } #endif // DOXYGEN_SHOULD_SKIP_THIS - // @endcond } // namespace std namespace hiptensor From 0b34d47ca7ba6848d71867142c76cf27b2c88d27 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:39:01 -0700 Subject: [PATCH 26/42] Bump rocm-docs-core from 0.30.1 to 0.30.2 in /docs/.sphinx (#171) Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.1 to 0.30.2. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.1...v0.30.2) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index ce13fde5..454c8157 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.30.1 +rocm-docs-core==0.30.2 # via -r requirements.in smmap==5.0.0 # via gitdb From 95af3c14c9950e76aafb852c0f41398eebf4abf9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Dec 2023 18:08:17 +0000 Subject: [PATCH 27/42] Bump rocm-docs-core from 0.30.2 to 0.30.3 in /docs/.sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.2 to 0.30.3. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.2...v0.30.3) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 454c8157..41817110 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.30.2 +rocm-docs-core==0.30.3 # via -r requirements.in smmap==5.0.0 # via gitdb From e41eda6d29f7a6649b1be2a915df1bd41093edab Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Tue, 2 Jan 2024 13:28:56 -0700 Subject: [PATCH 28/42] Standardize documentation for ReadtheDocs (#176) --- .github/dependabot.yml | 2 +- .gitignore | 11 ----------- .readthedocs.yaml | 6 ++---- README.md | 21 ++++++++++++-------- docs/.gitignore | 12 +++++------- docs/.sphinx/requirements.in | 1 - docs/conf.py | 24 +++++++++++++++++++++-- docs/{.doxygen => doxygen}/Doxyfile | 7 ++++--- docs/license.rst | 4 ++++ docs/{.sphinx => sphinx}/_toc.yml.in | 3 +++ docs/sphinx/requirements.in | 1 + docs/{.sphinx => sphinx}/requirements.txt | 4 +--- 12 files changed, 56 insertions(+), 40 deletions(-) delete mode 100644 docs/.sphinx/requirements.in rename docs/{.doxygen => doxygen}/Doxyfile (99%) create mode 100644 docs/license.rst rename docs/{.sphinx => sphinx}/_toc.yml.in (84%) create mode 100644 docs/sphinx/requirements.in rename docs/{.sphinx => sphinx}/requirements.txt (98%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 95e8b2ba..0e0a252e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,7 +6,7 @@ version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values - directory: "/docs/.sphinx" # Location of package manifests + directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" diff --git a/.gitignore b/.gitignore index 674c60bc..9945a9dc 100644 --- a/.gitignore +++ b/.gitignore @@ -50,14 +50,3 @@ build* \#*\# *~ *.log - -# documentation artifacts -build/ -_build/ -_images/ -_static/ -_templates/ -_toc.yml -docBin/ -_doxygen/ -.venv diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e2bf130c..9e6678ab 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,11 +10,9 @@ formats: [htmlzip, pdf, epub] python: install: - - requirements: docs/.sphinx/requirements.txt + - requirements: docs/sphinx/requirements.txt build: - os: ubuntu-20.04 + os: ubuntu-22.04 tools: python: "3.8" - apt_packages: - - "doxygen" diff --git a/README.md b/README.md index f5e55943..5af7912d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Run the steps below to build documentation locally. ```shell cd docs -pip3 install -r .sphinx/requirements.txt +pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` @@ -98,21 +98,24 @@ After configuration, build with `cmake --build -- -j` ### Logger tests Tests API implementation of logger verbosity and functionality. -o /bin/logger_test + +* `/bin/logger_test` ## Running Contraction Tests ### Bilinear contraction tests Tests the API implementation of bilinear contraction algorithm with validation. -o /bin/bilinear_contraction_f32_test -o /bin/bilinear_contraction_f64_test + +* `/bin/bilinear_contraction_f32_test` +* `/bin/bilinear_contraction_f64_test` ### Scale contraction tests Tests the API implementation of scale contraction algorithm with validation. 
-o /bin/scale_contraction_f32_test -o /bin/scale_contraction_f64_test + +* `/bin/scale_contraction_f32_test` +* `/bin/scale_contraction_f64_test` ### Samples @@ -121,12 +124,14 @@ These are stand-alone use-cases of the hipTensor contraction operations. ## F32 Bilinear contraction Demonstrates the API implementation of bilinear contraction operation without validation. -o /bin/simple_contraction_bilinear_f32 + +* `/bin/simple_contraction_bilinear_f32` ## F32 Scale contraction Demonstrates the API implementation of scale contraction operation without validation. -o /bin/simple_contraction_scale_f32 + +* `/bin/simple_contraction_scale_f32` ### Build Samples as external client diff --git a/docs/.gitignore b/docs/.gitignore index a44ccbe0..594c0c8c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,7 +1,5 @@ -.doxygen/docBin -.sphinx/_toc.yml -_build -_doxygen -_images -_static -_templates \ No newline at end of file +doxygen/html +doxygen/xml +sphinx/_toc.yml +_build/ +_doxygen/ diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in deleted file mode 100644 index 313c5e94..00000000 --- a/docs/.sphinx/requirements.in +++ /dev/null @@ -1 +0,0 @@ -rocm-docs-core>=0.24.0 diff --git a/docs/conf.py b/docs/conf.py index 4f00fb9e..e7e64d90 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,11 +29,31 @@ # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re + from rocm_docs import ROCmDocs -docs_core = ROCmDocs("hipTensor Documentation") -docs_core.run_doxygen() +with open('../CMakeLists.txt', encoding='utf-8') as f: + match = re.search(r'.*\bset \( VERSION_STRING\s+\"?([0-9.]+)[^0-9.]+', f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] +left_nav_title = f"hipTensor {version_number} Documentation" + +# for PDF output on Read the Docs +project = "hipTensor Documentation" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number + +external_toc_path = "./sphinx/_toc.yml" + +docs_core = ROCmDocs(left_nav_title) +docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() +external_projects_current_project = "hiptensor" + for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) diff --git a/docs/.doxygen/Doxyfile b/docs/doxygen/Doxyfile similarity index 99% rename from docs/.doxygen/Doxyfile rename to docs/doxygen/Doxyfile index 136d3b8c..6f96968a 100644 --- a/docs/.doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = docBin +OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -786,7 +786,8 @@ WARN_AS_ERROR = YES INPUT = ../../library/include/hiptensor \ ../../library/include/hiptensor/internal \ - ../../library/src + ../../library/src \ + ../../README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -965,7 +966,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. 
-USE_MDFILE_AS_MAINPAGE = ../README.md +USE_MDFILE_AS_MAINPAGE = ../../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 00000000..141b5d3c --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,4 @@ +License +======= + +.. include:: ../LICENSE diff --git a/docs/.sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in similarity index 84% rename from docs/.sphinx/_toc.yml.in rename to docs/sphinx/_toc.yml.in index 37b5a62b..6da76c27 100644 --- a/docs/.sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -8,3 +8,6 @@ subtrees: - file: API_Reference_Guide - file: Programmers_Guide - file: Contributors_Guide + - caption: About + entries: + - file: license diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in new file mode 100644 index 00000000..b80af261 --- /dev/null +++ b/docs/sphinx/requirements.in @@ -0,0 +1 @@ +rocm-docs-core==0.30.3 diff --git a/docs/.sphinx/requirements.txt b/docs/sphinx/requirements.txt similarity index 98% rename from docs/.sphinx/requirements.txt rename to docs/sphinx/requirements.txt index 41817110..81f0b559 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -84,9 +84,7 @@ pygments==2.15.0 # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 - # via - # pygithub - # pyjwt + # via pygithub pynacl==1.5.0 # via pygithub pytz==2023.3.post1 From 749644231a57063188ddf90791668619c0170200 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Mon, 11 Dec 2023 17:49:52 -0500 Subject: [PATCH 29/42] Add API changes - Add API changes - Add test files - Add samples - Add cpu contraction for complex types - Add complex instances --- .../hiptensor/internal/hiptensor_utility.hpp | 15 + .../contraction_cpu_reference_impl.hpp | 169 +++-- .../contraction_cpu_reference_instances.cpp | 60 ++ .../src/contraction/contraction_pack_util.hpp | 101 +++ .../contraction_solution_instances.cpp | 53 ++ library/src/contraction/device/CMakeLists.txt | 2 + .../device_contraction_bilinear_complex.hpp | 595 +++++++++++++++ ...ffle_cf32_cf32_cf32_cf32_kknn_instance.cpp | 105 +++ .../device_contraction_scale_complex.hpp | 699 ++++++++++++++++++ ..._c_shuffle_cf32_cf32_cf32_kkn_instance.cpp | 105 +++ library/src/data_types.cpp | 16 +- library/src/hiptensor.cpp | 3 +- library/src/include/data_types.hpp | 1 + library/src/include/data_types_impl.hpp | 20 + samples/01_contraction/CMakeLists.txt | 2 + ...action_cf32_cf32_cf32_cf32_compute_f32.cpp | 57 ++ ...contraction_cf32_cf32_cf32_compute_f32.cpp | 57 ++ test/00_unit/yaml_test.cpp | 4 + test/01_contraction/contraction_test.cpp | 88 ++- test/device/common.hpp | 17 +- test/llvm/yaml_parser_config.cpp | 2 + 21 files changed, 2104 insertions(+), 67 deletions(-) create mode 100644 library/src/contraction/contraction_pack_util.hpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_complex.hpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_complex.hpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp diff 
--git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp index c386bbe0..746f1bbf 100644 --- a/library/include/hiptensor/internal/hiptensor_utility.hpp +++ b/library/include/hiptensor/internal/hiptensor_utility.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include "../hiptensor_types.hpp" #include "types_ext.hpp" @@ -61,6 +62,20 @@ } #endif +inline std::ostream& operator<<(std::ostream& os, const hipFloatComplex& fc) +{ + std::string separator = (hipCimagf(fc) >= 0) ? " + " : ""; + + return os << hipCrealf(fc) << separator << hipCimagf(fc) << "i"; +} + +inline std::ostream& operator<<(std::ostream& os, const hipDoubleComplex& dc) +{ + std::string separator = (hipCimag(dc) >= 0) ? " + " : ""; + + return os << hipCreal(dc) << separator << hipCimag(dc) << "i"; +} + template void hiptensorPrintArrayElements(std::ostream& stream, T* vec, size_t size) { diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 2e3d0cbe..25c317e3 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -156,62 +156,127 @@ namespace hiptensor indices.begin(), indices.end(), strides.begin(), std::size_t{0}); }; - auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - AccDataType accum = 0; + if constexpr((std::is_same_v && + std::is_same_v && + std::is_same_v) || + (std::is_same_v && + std::is_same_v && + std::is_same_v)) + { + auto f_ms_ns_complex = [&](auto m0, auto m1, auto n0, auto n1) { + HIP_vector_type accum{0}; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) + { + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + ADataType valA = ((ADataType*)arg.mA)[indexA]; + BDataType valB = ((BDataType*)arg.mB)[indexB]; + + // Mult / accum + if constexpr(std::is_same_v) + { + accum = hipCaddf(accum, hipCmulf(valA, valB)); + } + else if constexpr(std::is_same_v) + { + accum = hipCadd(accum, hipCmul(valA, valB)); + } + } + } + + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.scale_ * (EDataType)accum; + } + else // bilinear + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.alpha_ * (EDataType)accum + + arg.mOpCDE.beta_ * ((EDataType*)(arg.mD[0]))[indexD]; + } + }; + + make_ParallelTensorFunctor(f_ms_ns_complex, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } + else + { + auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { + AccDataType accum = 0; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) + { + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + AccDataType valA; + AccDataType valB; + + // Element-wise ops + arg.mOpA( + valA, + ck::type_convert(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, +
ck::type_convert(((BDataType*)arg.mB)[indexB])); + + // Mult / accum + accum += valA * valB; + } + } - auto K0 = arg.mA_ms_ks_lengths[2]; - auto K1 = arg.mA_ms_ks_lengths[3]; + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - for(size_t k0 = 0; k0 < K0; k0++) - { - for(size_t k1 = 0; k1 < K1; k1++) + if constexpr(std::is_same_v) { - auto indexA - = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); - auto indexB - = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - - AccDataType valA; - AccDataType valB; - - // Element-wise ops - arg.mOpA( - valA, - ck::type_convert(((ADataType*)arg.mA)[indexA])); - arg.mOpB( - valB, - ck::type_convert(((BDataType*)arg.mB)[indexB])); - - // Mult / accum - accum += valA * valB; + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } - } - - auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - - if constexpr(std::is_same_v) - { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], - ck::type_convert(accum)); - } - else // bilinear - { - // NumDTensor will be 1 due to SFINAE of this class - auto indexD - = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE(((EDataType*)arg.mE)[indexE], - ck::type_convert(accum), - ((EDataType*)(arg.mD[0]))[indexD]); - } - }; - - make_ParallelTensorFunctor(f_ms_ns, - arg.mE_ms_ns_lengths[0], - arg.mE_ms_ns_lengths[1], - arg.mE_ms_ns_lengths[2], - arg.mE_ms_ns_lengths[3])( - std::thread::hardware_concurrency()); + else // bilinear + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); + } + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } return 0; } diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 31fb0191..68b4ad1b 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -105,6 +105,21 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, float>()); + // Bilinear complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f64 registerSolutions( enumerateReferenceSolutions<2, @@ -134,6 +149,21 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, double>()); + // Bilinear complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); + // Scale f16 registerSolutions( enumerateReferenceSolutions<2, @@ -207,6 +237,21 @@ namespace hiptensor ck::tensor_operation::element_wise::Scale, float>()); + // Scale complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple<>, + hipFloatComplex, + 
ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f64 registerSolutions( enumerateReferenceSolutions<2, @@ -235,5 +280,20 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, double>()); + + // Scale complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp new file mode 100644 index 00000000..49741547 --- /dev/null +++ b/library/src/contraction/contraction_pack_util.hpp @@ -0,0 +1,101 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_CONTRACTION_PACK_UTIL_HPP +#define HIPTENSOR_CONTRACTION_PACK_UTIL_HPP + +#include "data_types.hpp" +#include "util.hpp" +#include + +namespace hiptensor +{ + /** + * \brief This function unpacks structured data (hipFloatComplex / hipDoubleComplex) + * into non-structured data (float / double). + */ + template + __global__ void unpack(const InputType* in, OutputType* out_real, OutputType *out_img, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + out_real[idx] = hipCrealf(in[idx]); + out_img[idx] = hipCimagf(in[idx]); + } + else if constexpr(std::is_same_v) + { + out_real[idx] = hipCreal(in[idx]); + out_img[idx] = hipCimag(in[idx]); + } + } + } + + /** + * \brief This function packs non-structured data (float / double) + * into structured data (hipFloatComplex / hipDoubleComplex). 
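+ *
+ * For example, in_real = {1.f, 2.f} and in_img = {3.f, 4.f} pack into
+ * out = {1+3i, 2+4i}; the unpack kernel above performs the inverse split.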
+ */ + template + __global__ void pack(const InputType* in_real, InputType* in_img, OutputType *out, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + out[idx] = make_hipFloatComplex((float)in_real[idx], (float)in_img[idx]); + } + else if constexpr(std::is_same_v) + { + out[idx] = make_hipDoubleComplex((double)in_real[idx], (double)in_img[idx]); + } + } + } + + struct DeviceDeleter + { + void operator()(void* ptr) + { + CHECK_HIP_ERROR(hipFree(ptr)); + } + }; + + template + auto allocDevice(int64_t numElements) + { + T* data; + CHECK_HIP_ERROR(hipMalloc(&data, numElements)); + return std::unique_ptr(data, DeviceDeleter()); + } + +} // namespace hiptensor + +#endif // HIPTENSOR_CONTRACTION_PACK_UTIL_HPP + diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index aec12e32..65ed8f34 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -101,6 +101,19 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, ck::bhalf_t>()); + // Bilinear complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>()); + // Bilinear f64 registerSolutions( enumerateContractionSolutions<2, @@ -127,6 +140,19 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, double>()); + // Bilinear complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>()); + // Scale bf16 registerSolutions( enumerateContractionSolutions<2, @@ -194,6 +220,20 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, ck::bhalf_t>()); + + // scale complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>()); + // Scale f64 registerSolutions( enumerateContractionSolutions<2, @@ -220,5 +260,18 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, double>()); + // scale complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>()); + } } // namespace hiptensor diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index eacac5b1..17bff3ca 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -45,6 +45,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -73,6 +74,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp new file mode 100644 index 00000000..2ffc9559 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -0,0 +1,595 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP +#define HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP + +#include "../contraction_pack_util.hpp" +#include "common.hpp" +#include + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + + using hiptensor::allocDevice; + using hiptensor::ceilDiv; + using hiptensor::DeviceDeleter; + using hiptensor::elementSpaceFromLengthsAndStrides; + + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + + // The following is a specialization class for bilinear contractions of complex types. + // For complex types, the contraction can be decomposed into 4 simple bilinear contractions of + // the complex element type. + // The class implements a CK interface to wrap the 4 individual contraction operations and argument + // handling internally. + // Note: We are assuming that the data comes in as an Array of Structures (AOS) format in complex pairs. + // The argument initialization portion decomposes this data into structure of arrays (SOA) where the + // real and complex elements can be operated on separately. + + // Tensor Contraction: + // input : A + // input : B + // input : D0, D1, ... + // output : E + // C = a_op(A) * b_op(B) + // E = cde_op(C, D0, D1, ...) + // Assume: + // A[M0, M1, M2, ..., K0, K1, K2, ...] + // B[N0, N1, N2, ..., K0, K1, K2, ...] + // D[M0, M1, M2, ..., N0, N1, N2, ...] + // E[M0, M1, M2, ..., N0, N1, N2, ...] + template + struct DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + HIP_vector_type, + HIP_vector_type, + AccDataType, + CShuffleDataType, + ck::Tuple>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Bilinear, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched> + + : public DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Bilinear, + ComputeDataType> + { + // Complex device Op + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + using CDEElementwiseOperation = Bilinear; + + // Complex types given through the interface + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + + // Internal functional types we will use to + // decompose the complex types and operate on. 
+ using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = DsDataType; + using DecompE = EDataType; + + // For complex types, we need to make sure that all of the types are the same + static_assert(std::is_same_v && std::is_same_v + && std::is_same_v + && std::is_same_v + && std::is_same_v, + "Complex operations must have the same data type"); + + static_assert(std::is_same_v || std::is_same_v, + "Complex operations only supported with single or double precision"); + + static constexpr index_t NumDTensor = 1; + + // The internal operation that we will decompose the complex operations with. + // For complex will be either float or double + using DecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + using DecompArg = typename DecompOp::Argument; + + Argument(Argument&& other) + : mArgs({std::move(other.mArgs[0]), + std::move(other.mArgs[1]), + std::move(other.mArgs[2]), + std::move(other.mArgs[3])}) + { + } + + Argument& operator=(Argument&& other) + { + if(this != &other) + { + mArgs[0] = std::move(other.mArgs[0]); + mArgs[1] = std::move(other.mArgs[1]); + mArgs[2] = std::move(other.mArgs[2]); + mArgs[3] = std::move(other.mArgs[3]); + } + return *this; + } + + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + // Take the incoming arguments, treat them as complex. 
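+                // Illustrative sketch of what hiptensor::unpack does below (assuming the
+                // HIP complex types are {x, y} pairs with x = real part and y = imaginary
+                // part, which holds for hipFloatComplex / hipDoubleComplex):
+                //
+                //   for each i in [0, elementCount):
+                //       out_r[i] = input[i].x;
+                //       out_i[i] = input[i].y;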
+
+                    // Allocate Real and Imaginary inputs
+                    auto elementsA
+                        = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides);
+                    auto elementsB
+                        = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides);
+                    auto elementsD = elementSpaceFromLengthsAndStrides(ds_ms_ns_lengths[0],
+                                                                       ds_ms_ns_strides[0]);
+                    auto elementsE
+                        = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides);
+
+                    mA_real.reset(nullptr);
+                    mA_imag.reset(nullptr);
+                    mB_real.reset(nullptr);
+                    mB_imag.reset(nullptr);
+                    mD_real.reset(nullptr);
+                    mD_imag.reset(nullptr);
+                    mE_real.reset(nullptr);
+                    mE_imag.reset(nullptr);
+
+                    auto blockDim = dim3(1024);
+
+                    auto decompGrid = [blockDim](auto&       out_r,
+                                                 auto&       out_i,
+                                                 auto const* input_grid,
+                                                 uint32_t    elementCount) {
+                        using DecompT = typename std::decay_t<decltype(out_r)>::element_type;
+                        static_assert(std::is_same_v<
+                                          DecompT,
+                                          typename std::decay_t<decltype(out_i)>::element_type>,
+                                      "r and i buffers must be same type");
+
+                        if(input_grid != nullptr)
+                        {
+                            out_r = std::move(allocDevice<DecompT>(elementCount));
+                            out_i = std::move(allocDevice<DecompT>(elementCount));
+
+                            auto gridDim = dim3(ceilDiv(elementCount, blockDim.x));
+                            hiptensor::unpack<<<gridDim, blockDim>>>(
+                                input_grid, out_r.get(), out_i.get(), elementCount);
+                        }
+                    };
+
+                    // Cast each input through its own complex alias (the types are
+                    // identical by the static_assert above, but the names should match).
+                    decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA);
+                    decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB);
+                    decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD);
+                    decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE);
+
+                    auto allocArgs = [a_ms_ks_lengths,
+                                      a_ms_ks_strides,
+                                      b_ns_ks_lengths,
+                                      b_ns_ks_strides,
+                                      ds_ms_ns_lengths,
+                                      ds_ms_ns_strides,
+                                      e_ms_ns_lengths,
+                                      e_ms_ns_strides,
+                                      a_element_op,
+                                      b_element_op](auto&       out_e,
+                                                    auto const& in_a,
+                                                    auto const& in_b,
+                                                    auto const& in_d,
+                                                    auto const& cde_element_op) {
+                        return std::make_unique<DecompArg>(
+                            in_a.get(),
+                            in_b.get(),
+                            std::array<const void*, NumDTensor>{in_d.get()},
+                            out_e.get(),
+                            a_ms_ks_lengths,
+                            a_ms_ks_strides,
+                            b_ns_ks_lengths,
+                            b_ns_ks_strides,
+                            ds_ms_ns_lengths,
+                            ds_ms_ns_strides,
+                            e_ms_ns_lengths,
+                            e_ms_ns_strides,
+                            a_element_op,
+                            b_element_op,
+                            cde_element_op);
+                    };
+
+                    // Steps 1 and 3 of the sketch above apply the caller's alpha / beta
+                    // against the real and imaginary D planes; steps 2 and 4 accumulate
+                    // into the partially-computed E planes, so their beta must be 1
+                    // (beta has already been applied once).
+                    mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op);
+                    mArgs[1] = allocArgs(
+                        mE_real,
+                        mA_imag,
+                        mB_imag,
+                        mE_real,
+                        CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f, 1.0f});
+                    mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op);
+                    mArgs[3] = allocArgs(
+                        mE_imag,
+                        mA_imag,
+                        mB_real,
+                        mE_imag,
+                        CDEElementwiseOperation{cde_element_op.alpha_, 1.0f});
+                }
+
+                void Print() const
+                {
+                    std::cout << "Args0:" << std::endl;
+                    mArgs[0]->Print();
+                    std::cout << "Args1:" << std::endl;
+                    mArgs[1]->Print();
+                    std::cout << "Args2:" << std::endl;
+                    mArgs[2]->Print();
+                    std::cout << "Args3:" << std::endl;
+                    mArgs[3]->Print();
+                }
+
+                // private:
+                // Each argument set for complex:
+                std::unique_ptr<DecompArg> mArgs[4];
+
+                template <typename DataT>
+                using DeviceArray = std::unique_ptr<DataT, DeviceDeleter>;
+
+                // Manage extra memory for AOS->SOA
+                DeviceArray<DecompA>  mA_real;
+                DeviceArray<DecompA>  mA_imag;
+                DeviceArray<DecompB>  mB_real;
+                DeviceArray<DecompB>  mB_imag;
+                DeviceArray<DecompDs> mD_real;
+                DeviceArray<DecompDs> mD_imag;
+                DeviceArray<DecompE>  mE_real;
+                DeviceArray<DecompE>  mE_imag;
+            };
+
+            // Invoker
+            struct Invoker : public BaseInvoker
+            {
+                using Argument = typename DeviceOp::Argument;
+
+                Invoker()
+                    : mInvoker(std::make_unique<typename DecompOp::Invoker>())
+                {
+                }
+
+                Invoker(Invoker&& other)
+                    : mInvoker(std::move(other.mInvoker))
+                {
+                }
+
+                Invoker& operator=(Invoker&& other)
+                {
+                    if(this != &other)
+                    {
+                        mInvoker = std::move(other.mInvoker);
+                    }
+                    return *this;
+                }
+
+                float Run(const Argument& arg,
+                          const
StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mInvoker->Run(arg.mArgs[0].get(), stream_config); + auto r1 = mInvoker->Run(arg.mArgs[1].get(), stream_config); + auto r2 = mInvoker->Run(arg.mArgs[2].get(), stream_config); + auto r3 = mInvoker->Run(arg.mArgs[3].get(), stream_config); + + // Reduce results? + return r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return DecompOp::IsSupportedArgument(*(arg.mArgs[0].get())) + && DecompOp::IsSupportedArgument(*(arg.mArgs[1].get())) + && DecompOp::IsSupportedArgument(*(arg.mArgs[2].get())) + && DecompOp::IsSupportedArgument(*(arg.mArgs[3].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. + this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[2].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[3].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + 
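+                // Illustrative output, using the first kknn instance defined later in
+                // this patch (2, 2, 2 dims; 256 block; 256x128x16 tile; AK1 = BK1 = 4;
+                // source vector dims 2, 2):
+                //   "DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, 256, 256, 128, 16, 4, 4, 2, 2>"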
+ // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp new file mode 100644 index 00000000..fce71e8f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp @@ -0,0 +1,105 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! 
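+// (Editor's note, as we understand the trick: with this macro defined, CK relies on
+// the AMD buffer-load hardware returning zero for out-of-range accesses -- steering
+// out-of-bounds lanes to an invalid offset -- instead of explicit per-element
+// predication. Unverified against this CK version; treat as a sketch.)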
+// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + template + using S = ck::Sequence; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + + static constexpr auto GemmMNKPadding + = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance + = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 
32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float> + // clang-format on + >; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck + diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp 
b/library/src/contraction/device/device_contraction_scale_complex.hpp
new file mode 100644
index 00000000..96531ddd
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_complex.hpp
@@ -0,0 +1,699 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+#define HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+
+#include "../contraction_pack_util.hpp"
+#include "common.hpp"
+#include <hip/hip_complex.h>
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+
+            using hiptensor::allocDevice;
+            using hiptensor::ceilDiv;
+            using hiptensor::DeviceDeleter;
+            using hiptensor::elementSpaceFromLengthsAndStrides;
+
+            using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+            using Scale    = ck::tensor_operation::element_wise::Scale;
+
+            // The following is a specialization class for scale contractions of complex types.
+            // For complex types, the contraction is decomposed into one scale contraction
+            // and three bilinear contractions of the complex element type.
+            // The class implements a CK interface to wrap the four individual contraction
+            // operations and handle their arguments internally.
+            // Note: We are assuming that the data comes in as an Array of Structures (AOS)
+            // format in complex pairs. The argument initialization portion decomposes this
+            // data into a Structure of Arrays (SOA) where the real and imaginary parts can
+            // be operated on separately.
+
+            // Tensor Contraction:
+            //   input : A
+            //   input : B
+            //   output : E
+            // C = a_op(A) * b_op(B)
+            // E = cde_op(C)   (scale specialization: the Ds tuple is empty)
+            // Assume:
+            //   A[M0, M1, M2, ..., K0, K1, K2, ...]
+            //   B[N0, N1, N2, ..., K0, K1, K2, ...]
+            //   E[M0, M1, M2, ..., N0, N1, N2, ...]
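+            // Editor's sketch of the decomposition (assuming a real-valued scale factor
+            // alpha). Using (A_r + i*A_i)(B_r + i*B_i) = (A_r*B_r - A_i*B_i)
+            //                                            + i*(A_r*B_i + A_i*B_r),
+            // E = alpha*(A x B) is realized as one scale contraction plus three bilinear
+            // contractions that accumulate into the planar halves of E:
+            //
+            //   E_r  =  alpha*(A_r x B_r)            // mScaleArgs (scale op)
+            //   E_r += -alpha*(A_i x B_i)            // mBilinearArgs[0], beta = 1
+            //   E_i  =  alpha*(A_r x B_i) + 1*D_i    // mBilinearArgs[1], D_i assumed zero
+            //   E_i +=  alpha*(A_i x B_r)            // mBilinearArgs[2], beta = 1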
+ template + struct DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + HIP_vector_type, + HIP_vector_type, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Scale, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched> + + : public DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Scale, + ComputeDataType> + { + // Complex device Op + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + // CDE Operations + using ScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = Bilinear; + + // Complex types given through the interface + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + + // Internal functional types we will use to + // decompose the complex types and operate on. + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = EDataType; + using DecompE = EDataType; + + // For complex types, we need to make sure that all of the types are the same + static_assert(std::is_same_v && std::is_same_v + && std::is_same_v + && std::is_same_v, + "Complex operations must have the same data type"); + + static_assert(std::is_same_v || std::is_same_v, + "Complex operations only supported with single or double precision"); + + static constexpr index_t NumDTensor = 0; + + // The internal operation that we will decompose the complex operations with. 
+ // For complex will be either float or double + using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + ScaleCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched>; + + // The internal operation that we will decompose the complex operations with. + // For complex will be either float or double + using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + BilinearCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + using ScaleDecompArgument = typename ScaleDecompOp::Argument; + using BilinearDecompArgument = typename BilinearDecompOp::Argument; + + Argument(Argument&& other) + : mScaleArgs({std::move(other.mScaleArgs)}) + , mBilinearArgs({std::move(other.mBilinearArgs[0]), + std::move(other.mBilinearArgs[1]), + std::move(other.mBilinearArgs[2])}) + { + } + + Argument& operator=(Argument&& other) + { + if(this != &other) + { + mScaleArgs = std::move(other.mScaleArgs); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + mBilinearArgs[2] = std::move(other.mBilinearArgs[2]); + } + return *this; + } + + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const 
std::vector<index_t>&        e_ms_ns_strides,
+                     AElementwiseOperation        a_element_op,
+                     BElementwiseOperation        b_element_op,
+                     ScaleCDEElementwiseOperation cde_element_op)
+            {
+                // Take the incoming arguments, treat them as complex.
+
+                // Allocate Real and Imaginary inputs
+                auto elementsA
+                    = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides);
+                auto elementsB
+                    = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides);
+                auto elementsE
+                    = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides);
+
+                mA_real.reset(nullptr);
+                mA_imag.reset(nullptr);
+                mB_real.reset(nullptr);
+                mB_imag.reset(nullptr);
+                mD_real.reset(nullptr);
+                mD_imag.reset(nullptr);
+                mE_real.reset(nullptr);
+                mE_imag.reset(nullptr);
+
+                auto blockDim = dim3(1024);
+
+                auto decompGrid = [blockDim](auto&       out_r,
+                                             auto&       out_i,
+                                             auto const* input_grid,
+                                             uint32_t    elementCount) {
+                    using DecompT = typename std::decay_t<decltype(out_r)>::element_type;
+                    static_assert(std::is_same_v<
+                                      DecompT,
+                                      typename std::decay_t<decltype(out_i)>::element_type>,
+                                  "r and i buffers must be same type");
+
+                    if(input_grid != nullptr)
+                    {
+                        out_r = std::move(allocDevice<DecompT>(elementCount));
+                        out_i = std::move(allocDevice<DecompT>(elementCount));
+
+                        auto gridDim = dim3(ceilDiv(elementCount, blockDim.x));
+                        hiptensor::unpack<<<gridDim, blockDim>>>(
+                            input_grid, out_r.get(), out_i.get(), elementCount);
+                    }
+                };
+
+                // Decompose the incoming data from AOS->SOA
+                decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA);
+                decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB);
+                decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE);
+
+                // Allocate extra space for the bilinear accumulation steps.
+                mD_real = std::move(allocDevice<DecompDs>(elementsE));
+                mD_imag = std::move(allocDevice<DecompDs>(elementsE));
+
+                auto allocScaleArgs = [a_ms_ks_lengths,
+                                       a_ms_ks_strides,
+                                       b_ns_ks_lengths,
+                                       b_ns_ks_strides,
+                                       ds_ms_ns_lengths,
+                                       ds_ms_ns_strides,
+                                       e_ms_ns_lengths,
+                                       e_ms_ns_strides,
+                                       a_element_op,
+                                       b_element_op](auto&       out_e,
+                                                     auto const& in_a,
+                                                     auto const& in_b,
+                                                     auto const& cde_element_op) {
+                    return std::make_unique<ScaleDecompArgument>(
+                        in_a.get(),
+                        in_b.get(),
+                        std::array<const void*, 0>{},
+                        out_e.get(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        ds_ms_ns_lengths,
+                        ds_ms_ns_strides,
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
+                };
+
+                auto allocBilinearArgs = [a_ms_ks_lengths,
+                                          a_ms_ks_strides,
+                                          b_ns_ks_lengths,
+                                          b_ns_ks_strides,
+                                          ds_ms_ns_lengths,
+                                          ds_ms_ns_strides,
+                                          e_ms_ns_lengths,
+                                          e_ms_ns_strides,
+                                          a_element_op,
+                                          b_element_op](auto&       out_e,
+                                                        auto const& in_a,
+                                                        auto const& in_b,
+                                                        auto const& in_d,
+                                                        auto const& cde_element_op) {
+                    return std::make_unique<BilinearDecompArgument>(
+                        in_a.get(),
+                        in_b.get(),
+                        std::array<const void*, 1>{in_d.get()},
+                        out_e.get(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        std::array<std::vector<index_t>, 1>{e_ms_ns_lengths},
+                        std::array<std::vector<index_t>, 1>{e_ms_ns_strides},
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
+                };
+
+                // The four argument sets implement the decomposition sketched above.
+                // Note: mBilinearArgs[1] consumes mD_imag with beta = 1, which assumes
+                // the freshly allocated buffer reads as zero; mD_real is currently
+                // unused. If allocDevice does not zero-initialize, mD_imag must be
+                // cleared before Run().
+ mScaleArgs = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mBilinearArgs[0] = allocBilinearArgs( + mE_real, + mA_imag, + mB_imag, + mE_real, + BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); + mBilinearArgs[1] = allocBilinearArgs( + mE_imag, + mA_real, + mB_imag, + mD_imag, + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); + mBilinearArgs[2] = allocBilinearArgs( + mE_imag, + mA_imag, + mB_real, + mE_imag, + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); + } + + void Print() const + { + std::cout << "ScaleArgs:" << std::endl; + mScaleArgs->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + mBilinearArgs[1]->Print(); + std::cout << "BilinearArgs2:" << std::endl; + mBilinearArgs[2]->Print(); + } + + // private: + // Each argument set for complex: + std::unique_ptr mScaleArgs; + std::unique_ptr mBilinearArgs[3]; + + template + using DeviceArray = std::unique_ptr; + + // Manage extra memory for AOS->SOA + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mD_real; + DeviceArray mD_imag; + DeviceArray mE_real; + DeviceArray mE_imag; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = typename DeviceOp::Argument; + + Invoker() + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) + { + } + + Invoker(Invoker&& other) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , mBilinearInvoker(std::move(other.mBilinearInvoker)) + { + } + + Invoker& operator=(Invoker&& other) + { + if(this != &other) + { + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); + } + return *this; + } + + float Run(const Argument& arg, + const StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mScaleInvoker->Run(arg.mScaleArgs.get(), stream_config); + auto r1 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[2].get(), stream_config); + + // Reduce results? + return r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs.get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[2].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. 
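+                // All four decomposed argument sets share the single externally provided
+                // workspace, so the same pointer is forwarded to each of them (mirroring
+                // the bilinear complex specialization above).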
+ this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs.get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[2].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP + diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp new file mode 100644 index 00000000..dbec7ebe --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp @@ -0,0 +1,105 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + template + using S = ck::Sequence; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Scale = ck::tensor_operation::element_wise::Scale; + + static constexpr auto GemmMNKPadding + = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] + // k/k/n are the fast changing dimension for A/B/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance + = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| 
_NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 32, 128, 
16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 > + // clang-format on + >; + + void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck + diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index 38e9f186..09df158f 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -79,6 +79,14 @@ namespace hiptensor { return sizeof(uint64_t); } + else if(id == HIP_C_32F) + { + return sizeof(hipFloatComplex); + } + else if(id == HIP_C_64F) + { + return sizeof(hipDoubleComplex); + } else if(id == NONE_TYPE) { return 0; @@ -102,11 +110,11 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_16F; } - else if(hipType == HIP_R_32F) + else if(hipType == HIP_R_32F || hipType == HIP_C_32F) { return HIPTENSOR_COMPUTE_32F; } - else if(hipType == HIP_R_64F) + else if(hipType == HIP_R_64F || hipType == HIP_C_64F) { return HIPTENSOR_COMPUTE_64F; } @@ -187,11 +195,11 @@ bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) { return (computeType == HIPTENSOR_COMPUTE_16F); } - else if(hipType == HIP_R_32F) + else if(hipType == HIP_R_32F || hipType == HIP_C_32F) { return (computeType == HIPTENSOR_COMPUTE_32F); } - else if(hipType == HIP_R_64F) + else if(hipType == HIP_R_64F || hipType == HIP_C_64F) { return (computeType == HIPTENSOR_COMPUTE_64F); } diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 51af1f48..8d185758 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -153,7 +153,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han if((lens == nullptr) || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) - && (dataType != HIP_R_64F)) + && (dataType != HIP_R_64F) && (dataType != HIP_C_32F) + && (dataType != HIP_C_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 19ccca6c..97402fa3 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -31,6 +31,7 @@ // Include order needs to be preserved #include #include +#include #include #include diff --git a/library/src/include/data_types_impl.hpp b/library/src/include/data_types_impl.hpp index 7df6d7d9..ef3e7c77 100644 --- a/library/src/include/data_types_impl.hpp +++ b/library/src/include/data_types_impl.hpp @@ -105,6 +105,18 @@ namespace hiptensor static constexpr auto value = HIP_R_64U; }; + template <> + struct HipDataType + { + static 
constexpr auto value = HIP_C_32F; + }; + + template <> + struct HipDataType + { + static constexpr auto value = HIP_C_64F; + }; + template <> struct HipDataType { @@ -162,6 +174,14 @@ namespace hiptensor { return static_cast(*(uint64_t*)value); } + else if constexpr(std::is_same_v && id == HIP_C_32F) + { + return static_cast(*(hipFloatComplex*)value); + } + else if constexpr(std::is_same_v && id == HIP_C_64F) + { + return static_cast(*(hipDoubleComplex*)value); + } else { #if !NDEBUG diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index 00393f1d..c51a2dbc 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -31,6 +31,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) @@ -38,6 +39,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_f32 simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp new file mode 100644 index 00000000..25392592 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeC = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp new file mode 100644 index 00000000..7fc5c3a3 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeD = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); +} diff --git a/test/00_unit/yaml_test.cpp b/test/00_unit/yaml_test.cpp index 2efc6b6e..372fbbdd 100644 --- a/test/00_unit/yaml_test.cpp +++ b/test/00_unit/yaml_test.cpp @@ -79,9 +79,13 @@ int main(int argc, char* argv[]) yee.mDataTypes = { // clang-format off {HIP_R_32F, HIP_R_32F, hiptensor::NONE_TYPE, HIP_R_32F, HIP_R_32F}, // scale F32 + {HIP_C_32F, HIP_C_32F, hiptensor::NONE_TYPE, HIP_C_32F, HIP_C_32F}, // scale F32 Complex {HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F}, // bilinear F32 + {HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F}, // bilinear F32 Complex {HIP_R_64F, HIP_R_64F, hiptensor::NONE_TYPE, HIP_R_64F, HIP_R_64F}, // scale F64 + {HIP_C_64F, HIP_C_64F, hiptensor::NONE_TYPE, HIP_C_64F, HIP_C_64F}, // scale F64 Complex {HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F}, // bilinear F64 + {HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F}, // bilinear F64 Complex // clang-format on }; yee.mAlgorithms diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 76cc3033..2059fd73 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -57,8 +57,8 @@ namespace hiptensor bool ContractionTest::checkDevice(hipDataType datatype) const { return (isF32Supported() - && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) - || (isF64Supported() && datatype == HIP_R_64F); + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF || datatype == HIP_C_32F)) + || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F)); } bool ContractionTest::checkSizes() const @@ -117,14 +117,18 @@ namespace hiptensor auto DDataType = testType[3]; EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) - || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F) + || (ADataType == HIP_C_32F) || (ADataType == HIP_C_64F)); EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) - || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F) + || (BDataType == HIP_C_32F) || (BDataType == HIP_C_64F)); EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + || (CDataType == HIP_C_32F) || (CDataType == HIP_C_64F) || (CDataType == NONE_TYPE)); EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) - || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F) + || (DDataType == HIP_C_32F) || (DDataType == HIP_C_64F)); EXPECT_TRUE( (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) || 
        EXPECT_TRUE((computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF)
                    || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F));
@@ -290,6 +294,32 @@
                                elementsCD,
                                std::numeric_limits<double>::signaling_NaN());
        }
+        else if(ADataType == HIP_C_32F && BDataType == HIP_C_32F && DDataType == HIP_C_32F)
+        {
+            // Initialize matrix data on device
+            fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceA().get(), elementsA);
+            fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceB().get(), elementsB);
+            if(CDataType == HIP_C_32F)
+            {
+                fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceC().get(), elementsCD);
+            }
+            fillValLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceD().get(),
+                                                 elementsCD,
+                                                 std::numeric_limits<hipFloatComplex>::signaling_NaN());
+        }
+        else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F)
+        {
+            // Initialize matrix data on device
+            fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceA().get(), elementsA);
+            fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceB().get(), elementsB);
+            if(CDataType == HIP_C_64F)
+            {
+                fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceC().get(), elementsCD);
+            }
+            fillValLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceD().get(),
+                                                  elementsCD,
+                                                  std::numeric_limits<hipDoubleComplex>::signaling_NaN());
+        }

        resource->copyDeviceToHostAll(elementBytes);
@@ -446,7 +476,7 @@
                hiptensorPrintArrayElements(stream, (float*)D.get(), elementsCD);
                stream << std::endl;
            }
-            else
+            else if(DDataType == HIP_R_64F)
            {
                stream << "Tensor A elements:\n";
                hiptensorPrintArrayElements(
                    stream, (double*)resource->hostA().get(), elementsA);
                stream << std::endl;
@@ -467,6 +497,48 @@
                hiptensorPrintArrayElements(stream, (double*)D.get(), elementsCD);
                stream << std::endl;
            }
+            else if(DDataType == HIP_C_32F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipFloatComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipFloatComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipFloatComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements(stream, (hipFloatComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_C_64F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipDoubleComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipDoubleComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipDoubleComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements(stream, (hipDoubleComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
        }
    }
}
@@ -566,12 +638,12 @@
                                                             (hip_bfloat16*)reference.get(),
                                                             elementsCD);
        }
-        else if(DDataType == HIP_R_32F)
+        else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F)
        {
            std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<float>(
                (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD);
        }
-        else if(DDataType == HIP_R_64F)
+        else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F)
        {
            std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<double>(
                (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD);
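// The validation branches above reuse the real-valued comparison kernels for
// complex results. That works because hipFloatComplex is layout-compatible with
// two consecutive floats, so a complex buffer can be scanned as a flat real
// array. The same idea, host-side (names illustrative, not the library's API):
//
//     static_assert(sizeof(hipFloatComplex) == 2 * sizeof(float));
//     const float* flat = reinterpret_cast<const float*>(complexBuf); // re, im, re, im, ...
//
diff --git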
a/test/device/common.hpp b/test/device/common.hpp index 172e6953..392c74c9 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,8 +72,21 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; - data[index] = static_cast(value); + if constexpr(std::is_same_v) + { + auto value = (float(index / float(RAND_MAX) - 0.5) * 100) / elementSize; + data[index] = make_hipFloatComplex(value, value); + } + else if constexpr(std::is_same_v) + { + auto value = (double(index / double(RAND_MAX) - 0.5) * 100) / elementSize; + data[index] = make_hipDoubleComplex(value, value); + } + else + { + auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + data[index] = static_cast(value); + } } } diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index cd3eb46f..5c674045 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -113,6 +113,8 @@ namespace llvm io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); + io.enumCase(value, "HIP_C_32F", HIP_C_32F); + io.enumCase(value, "HIP_C_64F", HIP_C_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE); } }; From be8c0b5566ada525a694dc44afafccff2dccb643 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Tue, 12 Dec 2023 12:11:20 -0500 Subject: [PATCH 30/42] Add compute types - Add compute types - Add instance factory for scale --- .../contraction_solution_instances.cpp | 12 ++-- ..._c_shuffle_cf32_cf32_cf32_kkn_instance.cpp | 57 ++++++++++++++++++- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 65ed8f34..51e31635 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -112,7 +112,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); // Bilinear f64 registerSolutions( @@ -151,7 +152,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale bf16 registerSolutions( @@ -232,7 +234,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f64 registerSolutions( @@ -271,7 +274,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp index dbec7ebe..e6d5d15d 100644 --- 
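// The fillKernel specialization in the hunk above generates deterministic
// pseudo-data and, for complex types, mirrors it into both components. A
// host-side sketch of the complex branch (fillKernel itself runs on device):
//
//     for(uint32_t i = 0; i < elementSize; ++i)
//     {
//         float v = (i / float(RAND_MAX) - 0.5f) * 100.0f / elementSize;
//         data[i] = make_hipFloatComplex(v, v); // real == imag by construction
//     }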
a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp @@ -57,7 +57,7 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] // k/k/n are the fast changing dimension for A/B/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance = std::tuple< // clang-format off //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| @@ -80,7 +80,7 @@ namespace ck // clang-format on >; - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( std::vector + struct DeviceOperationInstanceFactory, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> + { + using DeviceOp = DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ComputeDataType>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) + { + if constexpr(is_same_v) + { + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + op_ptrs); + } + } + } + } + }; } // namespace instance } // namespace device } // namespace tensor_operation From f6e39c14fda8d2e2eafd4f38d1f29509e5474746 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Wed, 13 Dec 2023 10:01:44 -0500 Subject: [PATCH 31/42] Modify instance factory - Modify complex scale impl - Modify pack func --- .../src/contraction/contraction_pack_util.hpp | 2 +- .../contraction_solution_instances.cpp | 4 + library/src/contraction/device/CMakeLists.txt | 18 +- .../device_contraction_bilinear_complex.hpp | 42 ++- ...ffle_cf32_cf32_cf32_cf32_kknn_instance.cpp | 51 ++-- ...ffle_cf32_cf32_cf32_cf32_knnn_instance.cpp | 89 ++++++ ...ffle_cf32_cf32_cf32_cf32_mknn_instance.cpp | 89 ++++++ ...ffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp | 89 ++++++ ...ffle_cf64_cf64_cf64_cf64_kknn_instance.cpp | 90 ++++++ ...ffle_cf64_cf64_cf64_cf64_knnn_instance.cpp | 89 ++++++ ...ffle_cf64_cf64_cf64_cf64_mknn_instance.cpp | 89 ++++++ ...ffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp | 89 ++++++ .../device_contraction_scale_complex.hpp | 89 +++--- ..._c_shuffle_cf32_cf32_cf32_kkn_instance.cpp | 129 +++------ ..._c_shuffle_cf32_cf32_cf32_knn_instance.cpp | 89 ++++++ ..._c_shuffle_cf32_cf32_cf32_mkn_instance.cpp | 89 ++++++ ..._c_shuffle_cf32_cf32_cf32_mnn_instance.cpp | 89 ++++++ ..._c_shuffle_cf64_cf64_cf64_kkn_instance.cpp | 91 +++++++ ..._c_shuffle_cf64_cf64_cf64_knn_instance.cpp | 89 ++++++ 
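// The DeviceOperationInstanceFactory specialization in the hunk above follows
// composable_kernel's usual shape: specialize the factory on a full
// DeviceContractionMultipleD signature, then populate a vector of type-erased
// operations. A simplified sketch of the dispatch (names abbreviated; the real
// template argument lists are longer than shown):
//
//     static auto GetInstances()
//     {
//         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
//         if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2
//                      && is_same_v<ComputeDataType, float>)
//         {
//             add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance(op_ptrs);
//         }
//         return op_ptrs; // GetInstances must hand the collected instances back
//     }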
..._c_shuffle_cf64_cf64_cf64_mkn_instance.cpp | 89 ++++++ ..._c_shuffle_cf64_cf64_cf64_mnn_instance.cpp | 88 ++++++ ...tensor_contraction_bilinear_instances.hpp} | 169 ++++++------ ...hiptensor_contraction_scale_instances.hpp} | 257 ++++++++++-------- 23 files changed, 1626 insertions(+), 383 deletions(-) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp rename library/src/contraction/device/{hiptensor_contraction_bilinear.hpp => hiptensor_contraction_bilinear_instances.hpp} (75%) rename library/src/contraction/device/{hiptensor_contraction_scale.hpp => hiptensor_contraction_scale_instances.hpp} (63%) diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index 49741547..f242f3ea 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -91,7 +91,7 @@ namespace hiptensor auto allocDevice(int64_t numElements) { T* data; - CHECK_HIP_ERROR(hipMalloc(&data, numElements)); + CHECK_HIP_ERROR(hipMalloc(&data, numElements * sizeof(T))); return std::unique_ptr(data, DeviceDeleter()); } diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 51e31635..2cec41bc 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -27,6 +27,10 @@ #include "contraction_solution_instances.hpp" #include "contraction_solution.hpp" +// Ensure access to +#include "device/hiptensor_contraction_bilinear_instances.hpp" +#include "device/hiptensor_contraction_scale_instances.hpp" + namespace hiptensor { 
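// The contraction_pack_util.hpp hunk above fixes an undersized allocation:
// hipMalloc takes a byte count, so passing `numElements` allocated bytes, not
// elements, and under-allocated by a factor of sizeof(T). The corrected helper,
// restated as a sketch:
//
//     template <typename T>
//     auto allocDevice(int64_t numElements)
//     {
//         T* data;
//         CHECK_HIP_ERROR(hipMalloc(&data, numElements * sizeof(T))); // bytes, not elements
//         return std::unique_ptr<T, DeviceDeleter>(data, DeviceDeleter());
//     }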
ContractionSolutionInstances::ContractionSolutionInstances() diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index 17bff3ca..3ac03149 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -29,6 +29,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -45,7 +53,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -58,6 +65,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -74,7 +89,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 2ffc9559..d57c4fdf 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -294,7 +294,7 @@ namespace ck = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); auto elementsD = elementSpaceFromLengthsAndStrides(ds_ms_ns_lengths[0], ds_ms_ns_strides[0]); - auto elementsE + elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); mA_real.reset(nullptr); @@ -305,7 +305,10 @@ namespace ck mD_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); + mE_real_buf.reset(nullptr); + mE_imag_buf.reset(nullptr); + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -330,9 +333,13 @@ namespace ck }; decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); - decompGrid(mB_real, mB_imag, (const ComplexA*)p_b_grid, elementsB); - decompGrid(mD_real, mD_imag, (const ComplexA*)p_ds_grid[0], elementsD); - decompGrid(mE_real, mE_imag, (const ComplexA*)p_e_grid, elementsE); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); + + // Allocate extra space for intermediate results. 
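// Why two scratch buffers are allocated just below: with A = Ar + i*Ai,
// B = Br + i*Bi, D = Dr + i*Di and real-valued alpha/beta, the complex bilinear
// contraction E = alpha*(A x B) + beta*D splits into four real contractions,
// which is exactly what mArgs[0..3] below implement:
//
//     mE_real_buf = alpha*(Ar x Br) + beta*Dr              // mArgs[0]
//     Er          = -alpha*(Ai x Bi) + 1.0*mE_real_buf     // mArgs[1]
//     mE_imag_buf = alpha*(Ar x Bi) + beta*Di              // mArgs[2]
//     Ei          = alpha*(Ai x Br) + 1.0*mE_imag_buf      // mArgs[3]
//
// following (Ar + i*Ai)(Br + i*Bi) = (Ar*Br - Ai*Bi) + i*(Ar*Bi + Ai*Br).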
+                mE_real_buf = std::move(allocDevice(elementsE));
+                mE_imag_buf = std::move(allocDevice(elementsE));

                auto allocArgs = [a_ms_ks_lengths,
                                  a_ms_ks_strides,
@@ -366,15 +373,16 @@
                                    cde_element_op);
                };

-                mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op);
+                mArgs[0] = allocArgs(mE_real_buf, mA_real, mB_real, mD_real, cde_element_op);
                mArgs[1] = allocArgs(mE_real,
                                     mA_imag,
                                     mB_imag,
-                                     mE_real,
+                                     mE_real_buf,
                                     CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f,
-                                                             cde_element_op.beta_});
-                mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op);
-                mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, cde_element_op);
+                                                             1.0f});
+                mArgs[2] = allocArgs(mE_imag_buf, mA_real, mB_imag, mD_imag, cde_element_op);
+                mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag_buf,
+                                     CDEElementwiseOperation{cde_element_op.alpha_, 1.0f});
            }

            void Print() const
@@ -405,6 +413,11 @@
                DeviceArray mD_imag;
                DeviceArray mE_real;
                DeviceArray mE_imag;
+                DeviceArray mE_real_buf;
+                DeviceArray mE_imag_buf;
+
+                void*   mE_grid;
+                index_t elementsE;
            };

            // Invoker
@@ -439,8 +452,15 @@
                auto r2 = mInvoker->Run(arg.mArgs[2].get(), stream_config);
                auto r3 = mInvoker->Run(arg.mArgs[3].get(), stream_config);

-                // Reduce results?
-                return r3;
+                if(arg.mE_grid != nullptr)
+                {
+                    auto blockDim = dim3(1024);
+                    auto gridDim  = dim3(ceilDiv(arg.elementsE, blockDim.x));
+                    hiptensor::pack<<<gridDim, blockDim, 0, stream_config.stream_id_>>>(
+                        arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE);
+                }
+
+                return r0 + r1 + r2 + r3;
            }

            // polymorphic
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp
index fce71e8f..03514f47 100644
--- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp
@@ -33,6 +33,12 @@
 #include "common.hpp"
 #include "device_contraction_bilinear_complex.hpp"

+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
 namespace ck
 {
     namespace tensor_operation
@@ -45,39 +51,19 @@
            using CF32       = hipFloatComplex;
            using CF32_Tuple = ck::Tuple<CF32>;

-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
            // k/k/n/n are the fast changing dimension for A/B/D/E
            using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance
-                = std::tuple<
-        // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer|
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , 
CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( @@ -91,7 +77,7 @@ namespace ck PassThrough, PassThrough, Bilinear, - float>>>& instances) + F32>>>& instances) { add_device_operation_instances( instances, @@ -102,4 +88,3 @@ namespace ck } // namespace device } // namespace tensor_operation } // namespace ck - diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp new file mode 100644 index 00000000..bb1ccde5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp new file mode 100644 index 00000000..2d47acc0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp new file mode 100644 index 00000000..4c881c0a --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp new file mode 100644 index 00000000..ed2ba843 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
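// Naming key for this instance family (cf32 files above, cf64 files below):
// the letter group before "_instance" names the fastest-varying mode of each
// tensor, in A/B/D/E order for bilinear and A/B/E order for scale. Examples:
//
//     ..._cf32_cf32_cf32_cf32_kknn_instance -> device_contraction_kk_instance
//         (A k-fastest, B k-fastest, D/E n-fastest)
//     ..._cf32_cf32_cf32_cf32_mnnn_instance -> device_contraction_mn_instance
//         (A m-fastest, B n-fastest, D/E n-fastest)
//
// Sharing one kk/kn/mk/mn alias per layout keeps the block/tile tuning tables
// common across element types, which is why the per-file std::tuple lists
// could be deleted in this patch.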
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp new file mode 100644 index 00000000..03dd9293 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp new file mode 100644 index 00000000..c44a5daf --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp new file mode 100644 index 00000000..d045a404 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 96531ddd..94f3b7f1 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -301,10 +301,10 @@ namespace ck using BilinearDecompArgument = typename BilinearDecompOp::Argument; Argument(Argument&& other) - : mScaleArgs({std::move(other.mScaleArgs)}) + : mScaleArgs({std::move(other.mScaleArgs[0]), + std::move(other.mScaleArgs[1])}) , mBilinearArgs({std::move(other.mBilinearArgs[0]), - std::move(other.mBilinearArgs[1]), - std::move(other.mBilinearArgs[2])}) + std::move(other.mBilinearArgs[1])}) { } @@ -312,10 +312,10 @@ namespace ck { if(this != &other) { - mScaleArgs = std::move(other.mScaleArgs); + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); - mBilinearArgs[2] = std::move(other.mBilinearArgs[2]); } return *this; } @@ -350,11 +350,12 @@ namespace ck mA_imag.reset(nullptr); mB_real.reset(nullptr); mB_imag.reset(nullptr); - mD_real.reset(nullptr); - mD_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); + mE_real_buf.reset(nullptr); 
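// The scale-complex path rebuilt below mirrors the bilinear one, but with no
// D tensor: E = alpha*(A x B). With real alpha it decomposes into two scale
// contractions into scratch buffers plus two bilinear accumulations, matching
// the mScaleArgs[0..1] / mBilinearArgs[0..1] sets constructed further down:
//
//     mE_real_buf = alpha*(Ar x Br)                        // mScaleArgs[0]
//     mE_imag_buf = alpha*(Ar x Bi)                        // mScaleArgs[1]
//     Er          = -alpha*(Ai x Bi) + 1.0*mE_real_buf     // mBilinearArgs[0]
//     Ei          =  alpha*(Ai x Br) + 1.0*mE_imag_buf     // mBilinearArgs[1]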
+ mE_imag_buf.reset(nullptr); + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -380,12 +381,12 @@ namespace ck // Decompose the incoming data from AOS->SOA decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); - decompGrid(mB_real, mB_imag, (const ComplexA*)p_b_grid, elementsB); - decompGrid(mE_real, mE_imag, (const ComplexA*)p_e_grid, elementsE); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - // Allocate extra space bilinear op. - mD_real = std::move(allocDevice(elementsE)); - mD_imag = std::move(allocDevice(elementsE)); + // Allocate extra space for intermediate results to bilinear op. + mE_real_buf = std::move(allocDevice(elementsE)); + mE_imag_buf = std::move(allocDevice(elementsE)); auto allocScaleArgs = [a_ms_ks_lengths, a_ms_ks_strides, @@ -450,44 +451,38 @@ namespace ck cde_element_op); }; - // Not sure about these... - mScaleArgs = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[0] = allocScaleArgs(mE_real_buf, mA_real, mB_real, cde_element_op); + mScaleArgs[1] = allocScaleArgs(mE_imag_buf, mA_real, mB_imag, cde_element_op); mBilinearArgs[0] = allocBilinearArgs( mE_real, mA_imag, mB_imag, - mE_real, + mE_real_buf, BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_real, - mB_imag, - mD_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); - mBilinearArgs[2] = allocBilinearArgs( mE_imag, mA_imag, mB_real, - mE_imag, + mE_imag_buf, BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); } void Print() const { - std::cout << "ScaleArgs:" << std::endl; - mScaleArgs->Print(); + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); std::cout << "BilinearArgs0:" << std::endl; mBilinearArgs[0]->Print(); std::cout << "BilinearArgs1:" << std::endl; mBilinearArgs[1]->Print(); - std::cout << "BilinearArgs2:" << std::endl; - mBilinearArgs[2]->Print(); } // private: // Each argument set for complex: - std::unique_ptr mScaleArgs; - std::unique_ptr mBilinearArgs[3]; + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; template using DeviceArray = std::unique_ptr; @@ -497,10 +492,13 @@ namespace ck DeviceArray mA_imag; DeviceArray mB_real; DeviceArray mB_imag; - DeviceArray mD_real; - DeviceArray mD_imag; DeviceArray mE_real; DeviceArray mE_imag; + DeviceArray mE_real_buf; + DeviceArray mE_imag_buf; + + void* mE_grid; + index_t elementsE; }; // Invoker @@ -533,13 +531,20 @@ namespace ck float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - auto r0 = mScaleInvoker->Run(arg.mScaleArgs.get(), stream_config); - auto r1 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); - auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); - auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[2].get(), stream_config); + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); - // Reduce results? 
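// After the four sub-contractions, the real and imaginary planes live in
// separate (SOA) buffers, so Run() repacks them into the caller's interleaved
// (AOS) E tensor and reports the summed kernel times. What hiptensor::pack is
// assumed to do, as a sketch:
//
//     __global__ void pack(const float* re, const float* im, hipFloatComplex* out, int n)
//     {
//         int i = blockIdx.x * blockDim.x + threadIdx.x;
//         if(i < n)
//             out[i] = make_hipFloatComplex(re[i], im[i]);
//     }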
- return r3; + if(arg.mE_grid != nullptr) + { + auto blockDim = dim3(1024); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + hiptensor::pack<<>>( + arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); + } + + return r0 + r1 + r2 + r3; } // polymorphic @@ -555,10 +560,10 @@ namespace ck static bool IsSupportedArgument(const Argument& arg) { - return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs.get())) + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) - && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())) - && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[2].get())); + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); } // polymorphic @@ -576,13 +581,13 @@ namespace ck // Call the base, then fwd to each arg. this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); auto* arg = dynamic_cast(p_arg); - this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs.get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( arg->mBilinearArgs[0].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( arg->mBilinearArgs[1].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer( - arg->mBilinearArgs[2].get(), p_workspace, s); } static auto MakeArgument( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp index e6d5d15d..3352556d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp @@ -33,6 +33,12 @@ #include "common.hpp" #include "device_contraction_scale_complex.hpp" +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + namespace ck { namespace tensor_operation @@ -46,109 +52,38 @@ namespace ck using CF32 = hipFloatComplex; using Empty_Tuple = ck::Tuple<>; - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| 
MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, 
CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 > - // clang-format on - >; - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( - std::vector>>& instances) - { + = device_contraction_kk_instance; + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + std::vector>>& instances) + { add_device_operation_instances( instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance{}); } - // Contraction + Scale - template - struct DeviceOperationInstanceFactory, - HIP_vector_type, - ck::Tuple<>, - HIP_vector_type, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - ComputeDataType>> - { - using DeviceOp = DeviceContractionMultipleD, - HIP_vector_type, - ck::Tuple<>, - HIP_vector_type, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - ComputeDataType>; - - static auto GetInstances() - { - std::vector> op_ptrs; - - if constexpr(is_same_v && is_same_v && - is_same_v) - { - if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) - { - if constexpr(is_same_v) - { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( - op_ptrs); - } - } - } - } - }; } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp new file mode 100644 index 00000000..cfd6c7f4 --- /dev/null +++ 
b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp new file mode 100644 index 00000000..eacc1148 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp @@ -0,0 +1,89 @@ 
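These near-identical new instance files differ only by their layout suffix: the letters name which index is contiguous ("fast changing") in each operand, matching the comment each file carries (e.g. "k/n/n/n are the fast changing dimension for A/B/D/E" for knn). In stride terms the suffix selects where the unit stride sits; a sketch under one common convention, with hypothetical extents (the real values arrive through the hipTensor tensor descriptors):

    #include <cstdint>
    #include <vector>

    int main()
    {
        // Extents for A[m0, m1, k0, k1]
        const std::int64_t M0 = 4, M1 = 8, K0 = 16, K1 = 32;

        // "k"-fastest A (the kkn/knn instances): k1 carries the unit stride.
        std::vector<std::int64_t> aStridesKFast = {M1 * K0 * K1, K0 * K1, K1, 1};

        // "m"-fastest A (the mkn/mnn instances): m1 carries the unit stride instead.
        std::vector<std::int64_t> aStridesMFast = {M1, 1, K1 * M0 * M1, M0 * M1};

        (void)aStridesKFast;
        (void)aStridesMFast;
        return 0;
    }

Dispatching to the instance whose suffix matches the unit-stride pattern of the caller's descriptors is what keeps the XDL kernels on coalesced loads and stores.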
+/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp new file mode 100644 index 00000000..b5e79372 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp new file mode 100644 index 00000000..c0934498 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp new file mode 100644 index 00000000..8514cb70 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp new file mode 100644 index 00000000..09d589d6 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp new file mode 100644 index 00000000..6b90050b --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp @@ -0,0 +1,88 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance{}); + } + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/hiptensor_contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp similarity index 75% rename from library/src/contraction/device/hiptensor_contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp index e8f73b58..eac0f117 100644 --- a/library/src/contraction/device/hiptensor_contraction_bilinear.hpp +++ b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp @@ -37,120 +37,126 @@ namespace ck { namespace instance { - - // float + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + void - 
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); // double void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); // Contraction + Bilinear template + typename DsDataType, + typename EDataType, + typename ComputeDataT> struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>> + ck::tensor_operation::element_wise::Bilinear, + ComputeDataT>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>; + ck::tensor_operation::element_wise::Bilinear, + ComputeDataT>; static auto GetInstances() { std::vector> op_ptrs; if constexpr(is_same_v && is_same_v - && is_same_v && is_same_v) + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( op_ptrs); - 
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( op_ptrs); } } if constexpr(is_same_v && is_same_v - && is_same_v + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( op_ptrs); } } diff --git a/library/src/contraction/device/hiptensor_contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp similarity index 63% rename from library/src/contraction/device/hiptensor_contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale_instances.hpp index 916f79de..fff9dca6 100644 --- a/library/src/contraction/device/hiptensor_contraction_scale.hpp +++ b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp @@ -37,136 +37,159 @@ namespace ck { namespace instance { - - // float - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances); - - // double - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances); - + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + using F64 = double; + using CF64 = hipDoubleComplex; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + std::vector>>& 
instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + std::vector>>& instances); + // Contraction + Scale template - struct HipTensorDeviceOperationInstanceFactory< + typename EDataType, + typename ComputeDataType> + struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>> + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>; + ck::tensor_operation::element_wise::Scale, + ComputeDataType>; static auto GetInstances() { @@ -177,13 +200,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( op_ptrs); } } @@ -193,13 +216,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( op_ptrs); } } From 075cdfa071ea33406ebc307cae85e3b05228599a Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Fri, 15 Dec 2023 02:11:27 -0500 Subject: [PATCH 32/42] Enable complex f32 - Remove intermediate buffers --- .../device_contraction_bilinear_complex.hpp | 16 ++++------------ .../device_contraction_scale_complex.hpp | 18 +++++------------- .../configs/bilinear_test_params.yaml | 16 +++++++++------- .../configs/scale_test_params.yaml | 2 ++ 4 files changed, 20 insertions(+), 32 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp 
b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index d57c4fdf..2df240c4 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -305,8 +305,6 @@ namespace ck mD_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_real_buf.reset(nullptr); - mE_imag_buf.reset(nullptr); mE_grid = p_e_grid; auto blockDim = dim3(1024); @@ -337,10 +335,6 @@ namespace ck decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - // Allocate extra space for intermediate results. - mE_real_buf = std::move(allocDevice(elementsE)); - mE_imag_buf = std::move(allocDevice(elementsE)); - auto allocArgs = [a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, @@ -373,15 +367,15 @@ namespace ck cde_element_op); }; - mArgs[0] = allocArgs(mE_real_buf, mA_real, mB_real, mD_real, cde_element_op); + mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op); mArgs[1] = allocArgs(mE_real, mA_imag, mB_imag, - mE_real_buf, + mE_real, CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f, 1.0f}); - mArgs[2] = allocArgs(mE_imag_buf, mA_real, mB_imag, mD_imag, cde_element_op); - mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag_buf, + mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op); + mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f}); } @@ -413,8 +407,6 @@ namespace ck DeviceArray mD_imag; DeviceArray mE_real; DeviceArray mE_imag; - DeviceArray mE_real_buf; - DeviceArray mE_imag_buf; void* mE_grid; index_t elementsE; diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 94f3b7f1..f3a7fd2e 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -343,7 +343,7 @@ namespace ck = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides); auto elementsB = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); - auto elementsE + elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); mA_real.reset(nullptr); @@ -352,8 +352,6 @@ namespace ck mB_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_real_buf.reset(nullptr); - mE_imag_buf.reset(nullptr); mE_grid = p_e_grid; auto blockDim = dim3(1024); @@ -384,10 +382,6 @@ namespace ck decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - // Allocate extra space for intermediate results to bilinear op. 
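The payoff of this patch is visible in the mArgs rewrite above: each decomposed launch is elementwise in its D/E epilogue, mapping output element i to addend element i, so the Bilinear passes can name the same plane (mE_real or mE_imag) as both their D input and their E output and accumulate in place; the scratch planes mE_real_buf/mE_imag_buf and their allocations disappear. This relies on the epilogue reading the addend before writing the result for each element. In scalar form the surviving schedule is (real-valued alpha and beta, which is what the code handles at this point in the series; a minimal sketch, not part of the patch):

    #include <cassert>
    #include <cmath>

    int main()
    {
        float a_r = 2.f, a_i = 3.f, b_r = 5.f, b_i = -1.f;
        float d_r = 0.5f, d_i = -0.25f;
        float alpha = 0.75f, beta = 2.f;

        float e_r = alpha * (a_r * b_r) + beta * d_r; // mArgs[0]
        e_r = -alpha * (a_i * b_i) + 1.0f * e_r;      // mArgs[1], E aliased as its own D
        float e_i = alpha * (a_r * b_i) + beta * d_i; // mArgs[2]
        e_i = alpha * (a_i * b_r) + 1.0f * e_i;       // mArgs[3]

        // Reference: alpha * (a * b) + beta * d in complex arithmetic.
        assert(std::abs(e_r - (alpha * (a_r * b_r - a_i * b_i) + beta * d_r)) < 1e-5f);
        assert(std::abs(e_i - (alpha * (a_r * b_i + a_i * b_r) + beta * d_i)) < 1e-5f);
        return 0;
    }

The same aliasing removes the two scratch planes from the scale path in the hunk that follows.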
- mE_real_buf = std::move(allocDevice(elementsE)); - mE_imag_buf = std::move(allocDevice(elementsE)); - auto allocScaleArgs = [a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, @@ -451,19 +445,19 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real_buf, mA_real, mB_real, cde_element_op); - mScaleArgs[1] = allocScaleArgs(mE_imag_buf, mA_real, mB_imag, cde_element_op); + mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); mBilinearArgs[0] = allocBilinearArgs( mE_real, mA_imag, mB_imag, - mE_real_buf, + mE_real, BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); mBilinearArgs[1] = allocBilinearArgs( mE_imag, mA_imag, mB_real, - mE_imag_buf, + mE_imag, BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); } @@ -494,8 +488,6 @@ namespace ck DeviceArray mB_imag; DeviceArray mE_real; DeviceArray mE_imag; - DeviceArray mE_real_buf; - DeviceArray mE_imag_buf; void* mE_grid; index_t elementsE; diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index f4be1a88..cbaee86a 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,13 +1,15 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] - - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] - - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] + - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index f4be1a88..4e640034 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -8,6 +8,8 @@ Tensor Data Types: - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] + - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_R_32F ] + - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT From 1bf4a550b2c700901774e68720db66a8b47a21df Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Wed, 20 Dec 2023 12:39:08 -0500 Subject: [PATCH 33/42] Add scalar and complex type instances - Add cpu instances - Add scale complex initial imp - Remove compute f32 instances for complex f32 - Modify multiply utility - Modified bilinear to take complex 
compute --- .../contraction_cpu_reference_instances.cpp | 56 +++++++++++++++ .../src/contraction/contraction_pack_util.hpp | 50 ++++++++++++- library/src/contraction/device/CMakeLists.txt | 32 ++++----- .../device_contraction_bilinear_complex.hpp | 50 +++++++++---- ..._cf32_cf32_compute_cf32_kknn_instance.cpp} | 10 +-- ..._cf32_cf32_compute_cf32_knnn_instance.cpp} | 10 +-- ..._cf32_cf32_compute_cf32_mknn_instance.cpp} | 10 +-- ..._cf32_cf32_compute_cf32_mnnn_instance.cpp} | 10 +-- ..._cf64_cf64_compute_cf64_kknn_instance.cpp} | 10 +-- ..._cf64_cf64_compute_cf64_knnn_instance.cpp} | 10 +-- ..._cf64_cf64_compute_cf64_mknn_instance.cpp} | 16 ++--- ..._cf64_cf64_compute_cf64_mnnn_instance.cpp} | 10 +-- .../device_contraction_scale_complex.hpp | 71 +++++++++++++------ ...2_cf32_cf32_compute_cf32_kkn_instance.cpp} | 10 +-- ...2_cf32_cf32_compute_cf32_knn_instance.cpp} | 10 +-- ...2_cf32_cf32_compute_cf32_mkn_instance.cpp} | 12 ++-- ...2_cf32_cf32_compute_cf32_mnn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_kkn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_knn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_mkn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_mnn_instance.cpp} | 10 +-- 21 files changed, 290 insertions(+), 137 deletions(-) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp} (92%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp => 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp} (94%) diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 68b4ad1b..d2fd77fa 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -120,6 +120,20 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, float>()); + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + hipFloatComplex>()); + // Bilinear f64 registerSolutions( enumerateReferenceSolutions<2, @@ -164,6 +178,20 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, double>()); + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + hipDoubleComplex>()); + // Scale f16 registerSolutions( enumerateReferenceSolutions<2, @@ -252,6 +280,20 @@ namespace hiptensor ck::tensor_operation::element_wise::Scale, float>()); + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + hipFloatComplex>()); + // Scale f64 registerSolutions( enumerateReferenceSolutions<2, @@ -295,5 +337,19 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, double>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + 
ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index f242f3ea..bcc99398 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -33,6 +33,55 @@ namespace hiptensor { + /** + * \brief This function performs multiply-accumulate of the form E = accum * alpha + D * beta + * + */ + template + __global__ void mfma(DataType* mE_real, DataType* mE_imag, DataType* mD_real, DataType* mD_imag, + HIP_vector_type *mE_grid, HIP_vector_type alpha, + HIP_vector_type beta, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCaddf(hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha), + hipCmulf(make_hipFloatComplex(mD_real[idx], mD_imag[idx]), beta)); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCadd(hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha), + hipCmul(make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), beta)); + } + } + } + + /** + * \brief This function performs multiply of the form C = accum * alpha + * + */ + template + __global__ void multiply(DataType* mE_real, DataType* mE_imag, HIP_vector_type *mE_grid, + HIP_vector_type alpha, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha); + } + } + } + /** * \brief This function unpacks structured data (hipFloatComplex / hipDoubleComplex) * into non-structured data (float / double). 
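For orientation, here is a minimal host-side sketch of the per-element arithmetic the mfma kernel above performs, assuming only the hip_complex.h helpers already used in the patch (the function name mfmaHostReference is illustrative, not part of this change):

#include <hip/hip_complex.h>
#include <vector>

// Host reference for the mfma kernel: for every element,
// E[i] = alpha * (E_real[i] + i*E_imag[i]) + beta * (D_real[i] + i*D_imag[i]).
void mfmaHostReference(std::vector<float> const& eReal,
                       std::vector<float> const& eImag,
                       std::vector<float> const& dReal,
                       std::vector<float> const& dImag,
                       std::vector<hipFloatComplex>& eGrid,
                       hipFloatComplex alpha,
                       hipFloatComplex beta)
{
    for(size_t i = 0; i < eGrid.size(); i++)
    {
        // Re-pack the decomposed real / imaginary planes, then scale.
        auto accum = make_hipFloatComplex(eReal[i], eImag[i]);
        auto d     = make_hipFloatComplex(dReal[i], dImag[i]);
        eGrid[i]   = hipCaddf(hipCmulf(accum, alpha), hipCmulf(d, beta));
    }
}

The multiply kernel is the same idea without the D term: E[i] = alpha * (E_real[i] + i*E_imag[i]).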
@@ -98,4 +147,3 @@ namespace hiptensor } // namespace hiptensor #endif // HIPTENSOR_CONTRACTION_PACK_UTIL_HPP - diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index 3ac03149..b65a8ab1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -29,14 +29,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -65,14 +65,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 2df240c4..c7a71263 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -152,7 +152,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + HIP_vector_type, LoopSched> : public DeviceContractionMultipleD; - using ComplexB = HIP_vector_type; - using ComplexDs = HIP_vector_type; - using ComplexE = HIP_vector_type; + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + using ComplexCompute = HIP_vector_type; // Internal functional types we will use to // decompose the complex types and operate on. 
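For reference, the decomposition these internal Decomp types enable follows the usual complex product identity (Ar + i*Ai)(Br + i*Bi) = (Ar*Br - Ai*Bi) + i*(Ar*Bi + Ai*Br). Below is a per-element sketch of the four bilinear sub-contractions set up by mArgs[0..3] further down, in this work-in-progress state where alpha and beta are deferred to the mfma pack step (Planes and bilinearDecomposed are illustrative names, not patch code):

struct Planes
{
    float re;
    float im;
};

// Per-element view of the four decomposed real-plane contractions;
// each statement mirrors one mArgs launch below.
Planes bilinearDecomposed(Planes a, Planes b, Planes d)
{
    Planes e;
    e.re = a.re * b.re + d.re;  // mArgs[0]: E_real  =  Ar*Br + D_real
    e.re = -a.im * b.im + e.re; // mArgs[1]: E_real += -Ai*Bi
    e.im = a.re * b.im + d.im;  // mArgs[2]: E_imag  =  Ar*Bi + D_imag
    e.im = a.im * b.re + e.im;  // mArgs[3]: E_imag +=  Ai*Br
    return e;
}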
- using DecompA = ADataType; - using DecompB = BDataType; - using DecompDs = DsDataType; - using DecompE = EDataType; + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = DsDataType; + using DecompE = EDataType; + using DecompCompute = ComputeDataType; // For complex types, we need to make sure that all of the types are the same static_assert(std::is_same_v && std::is_same_v && std::is_same_v - && std::is_same_v - && std::is_same_v, + && std::is_same_v + && std::is_same_v, "Complex operations must have the same data type"); static_assert(std::is_same_v || std::is_same_v, @@ -297,6 +299,8 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + element_op = cde_element_op; + mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -367,6 +371,19 @@ namespace ck cde_element_op); }; + mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[1] = allocArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + CDEElementwiseOperation{-1.0f, + 1.0f}); + mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, + CDEElementwiseOperation{1.0f , 1.0f}); + + // original + /* TODO :Uncomment once done mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op); mArgs[1] = allocArgs(mE_real, mA_imag, @@ -376,7 +393,7 @@ namespace ck 1.0f}); mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op); mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f}); + CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f});*/ } void Print() const @@ -408,6 +425,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; + CDEElementwiseOperation element_op{1.0f, 1.0f}; void* mE_grid; index_t elementsE; }; @@ -448,8 +466,12 @@ namespace ck { auto blockDim = dim3(1024); auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::pack<<>>( - arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); + hiptensor::mfma<<>>( + arg.mE_real.get(), arg.mE_imag.get(), arg.mD_real.get(), arg.mD_imag.get(), + ((ComplexE*)arg.mE_grid), arg.element_op.alpha_, arg.element_op.beta_, + arg.elementsE); + //hiptensor::pack<<>>( + // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp index 03514f47..02e3834e 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp @@ -53,20 +53,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E - using 
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance = device_contraction_kk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp index bb1ccde5..742d49a6 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance = device_contraction_kn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp index 2d47acc0..0f6b19d1 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E - 
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance = device_contraction_mk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp index 4c881c0a..184aea57 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance = device_contraction_mn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp index ed2ba843..5be10230 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp @@ -53,20 +53,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for 
A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance = device_contraction_f64_kk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp index 03dd9293..bf5c1667 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance = device_contraction_f64_kn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp similarity index 92% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp index c44a5daf..e07e603e 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp @@ -52,35 +52,35 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing 
dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance = device_contraction_f64_mk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( std::vector>>& instances) + PassThrough, + Bilinear, + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp index d045a404..3329307a 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance = device_contraction_f64_mn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index f3a7fd2e..43a9358c 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -43,8 +43,8 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - using Scale = ck::tensor_operation::element_wise::Scale; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using Scale = ck::tensor_operation::element_wise::Scale; // The following is a specialization class for bilinear contractions of complex types. 
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -152,7 +152,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + HIP_vector_type, LoopSched> : public DeviceContractionMultipleD + HIP_vector_type> { // Complex device Op using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; @@ -175,22 +175,24 @@ namespace ck using BilinearCDEElementwiseOperation = Bilinear; // Complex types given through the interface - using ComplexA = HIP_vector_type; - using ComplexB = HIP_vector_type; - using ComplexDs = HIP_vector_type; - using ComplexE = HIP_vector_type; + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + using ComplexCompute = HIP_vector_type; // Internal functional types we will use to // decompose the complex types and operate on. - using DecompA = ADataType; - using DecompB = BDataType; - using DecompDs = EDataType; - using DecompE = EDataType; + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = EDataType; + using DecompE = EDataType; + using DecompCompute = ComputeDataType; // For complex types, we need to make sure that all of the types are the same static_assert(std::is_same_v && std::is_same_v - && std::is_same_v - && std::is_same_v, + && std::is_same_v + && std::is_same_v, "Complex operations must have the same data type"); static_assert(std::is_same_v || std::is_same_v, @@ -243,7 +245,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + DecompCompute, LoopSched>; // The internal operation that we will decompose the complex operations with. 
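The scale specialization relies on the same identity, but with no D tensor to fold in: each output plane is seeded by a Scale contraction and the cross term is accumulated by a Bilinear one, with the complex alpha applied afterwards by the multiply kernel. A per-element sketch mirroring the mScaleArgs / mBilinearArgs launches set up further down (scaleDecomposed is an illustrative name, not patch code):

// Two Scale + two Bilinear sub-contractions, per element.
void scaleDecomposed(float ar, float ai, float br, float bi,
                     float& eRe, float& eIm)
{
    eRe = ar * br;        // mScaleArgs[0]:    E_real  =  Ar*Br
    eRe = -ai * bi + eRe; // mBilinearArgs[0]: E_real += -Ai*Bi
    eIm = ar * bi;        // mScaleArgs[1]:    E_imag  =  Ar*Bi
    eIm = ai * br + eIm;  // mBilinearArgs[1]: E_imag +=  Ai*Br
}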
@@ -291,7 +293,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + DecompCompute, LoopSched>; // Argument @@ -314,8 +316,8 @@ namespace ck { mScaleArgs[0] = std::move(other.mScaleArgs[0]); mScaleArgs[1] = std::move(other.mScaleArgs[1]); - mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); - mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); } return *this; } @@ -346,6 +348,8 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + element_op = cde_element_op; + mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -445,7 +449,26 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, ScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] = allocBilinearArgs( + mE_real, + mA_imag, + mB_imag, + mE_real, + BilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, ScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] = allocBilinearArgs( + mE_imag, + mA_imag, + mB_real, + mE_imag, + BilinearCDEElementwiseOperation{1.0f, 1.0f}); + + + // TODO UNCOMMENT WHEN DONE + // original + /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); mBilinearArgs[0] = allocBilinearArgs( mE_real, @@ -458,7 +481,7 @@ namespace ck mA_imag, mB_real, mE_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const @@ -489,6 +512,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; + ScaleCDEElementwiseOperation element_op{1.0}; void* mE_grid; index_t elementsE; }; @@ -532,8 +556,11 @@ namespace ck { auto blockDim = dim3(1024); auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::pack<<>>( - arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); + + hiptensor::multiply<<>>( + arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.element_op.scale_, arg.elementsE); + //hiptensor::pack<<>>( + // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp index 3352556d..9e9c8f9a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp @@ -54,19 +54,19 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E - using 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance = device_contraction_kk_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp index cfd6c7f4..b9183a21 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance = device_contraction_kn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp index eacc1148..1f87031d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance = device_contraction_mk_instance; void 
- add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp index b5e79372..ef7724e0 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance = device_contraction_mn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp index c0934498..e22aab5f 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp @@ -54,20 +54,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance = device_contraction_f64_kk_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( std::vector>>& instances) + 
CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp index 8514cb70..58ed790a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance = device_contraction_f64_kn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp index 09d589d6..562519f5 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance = device_contraction_f64_mk_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance{}); + 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp index 6b90050b..724d89cf 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance = device_contraction_f64_mn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance{}); } } // namespace instance } // namespace device From b50a1597edc306a24dd7580ca83f9d4e9f45836e Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Thu, 21 Dec 2023 12:20:48 -0500 Subject: [PATCH 34/42] Add complex bilinear and scale structures --- .../device/device_element_wise_complex.hpp | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 library/src/contraction/device/device_element_wise_complex.hpp diff --git a/library/src/contraction/device/device_element_wise_complex.hpp b/library/src/contraction/device/device_element_wise_complex.hpp new file mode 100644 index 00000000..6dfd94e0 --- /dev/null +++ b/library/src/contraction/device/device_element_wise_complex.hpp @@ -0,0 +1,108 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP +#define HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP + +#include +#include +#include + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct ScaleComplex : public Scale +{ + __host__ __device__ ScaleComplex(hipFloatComplex scale) : Scale(hipCrealf(scale)) + { + scale_ = hipComplexFloatToDouble(scale); + } + + __host__ __device__ ScaleComplex(hipDoubleComplex scale) : Scale(hipCreal(scale)) + { + scale_ = scale; + } + + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(hipFloatComplex& y, const hipFloatComplex& x) const + { + y = hipCmulf(hipComplexDoubleToFloat(scale_), x); + }; + + template <> + __host__ __device__ void operator()(hipDoubleComplex& y, const hipDoubleComplex& x) const + { + y = hipCmul(scale_, x); + }; + + // complex * float + hipDoubleComplex scale_; +}; + +struct BilinearComplex : public Bilinear +{ + BilinearComplex(hipFloatComplex alpha, hipFloatComplex beta) : Bilinear(hipCrealf(alpha), hipCrealf(beta)) + { + alpha_ = hipComplexFloatToDouble(alpha); + beta_ = hipComplexFloatToDouble(beta); + } + + BilinearComplex(hipDoubleComplex alpha, hipDoubleComplex beta) : Bilinear(hipCreal(alpha), hipCreal(beta)) + { + alpha_ = alpha; + beta_ = beta; + } + + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const; + + template <> + __host__ __device__ constexpr void + operator()(hipDoubleComplex& y, const hipDoubleComplex& x0, const hipDoubleComplex& x1) const + { + y = hipCadd(hipCmul(alpha_, x0), hipCmul(beta_, x1)); + }; + + template <> + __host__ __device__ constexpr void + operator()(hipFloatComplex& y, const hipFloatComplex& x0, const hipFloatComplex& x1) const + { + y = hipCaddf(hipCmulf(hipComplexDoubleToFloat(alpha_), x0), hipCmulf(hipComplexDoubleToFloat(beta_), x1)); + }; + + hipDoubleComplex alpha_; + hipDoubleComplex beta_; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP From b85aa93109b173a3066841eeb0d1d426b7fec539 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 21 Dec 2023 18:28:58 +0000 Subject: [PATCH 35/42] Support complex alpha and beta in contraction --- library/include/hiptensor/hiptensor_types.hpp | 2 + .../src/contraction/contraction_selection.cpp | 21 +++- .../contraction/contraction_solution_impl.hpp | 16 +-- library/src/data_types.cpp | 98 ++++++++++++++++++- library/src/include/data_types.hpp | 45 +++++++++ library/src/include/data_types_impl.hpp | 6 +- test/01_contraction/CMakeLists.txt | 6 ++ .../complex_bilinear_contraction_test.cpp | 48 +++++++++ .../configs/complex_bilinear_test_params.yaml | 37 +++++++ 9 files changed, 263 insertions(+), 16 deletions(-) create mode 100644 test/01_contraction/complex_bilinear_contraction_test.cpp create mode 100644 test/01_contraction/configs/complex_bilinear_test_params.yaml diff --git a/library/include/hiptensor/hiptensor_types.hpp b/library/include/hiptensor/hiptensor_types.hpp index 85a5d90e..ca666a5b 100644 --- a/library/include/hiptensor/hiptensor_types.hpp +++ 
b/library/include/hiptensor/hiptensor_types.hpp @@ -90,6 +90,8 @@ typedef enum HIPTENSOR_COMPUTE_8I = (1U << 8U), HIPTENSOR_COMPUTE_32U = (1U << 7U), HIPTENSOR_COMPUTE_32I = (1U << 9U), + HIPTENSOR_COMPUTE_C32F = (1U << 11U), + HIPTENSOR_COMPUTE_C64F = (1U << 12U), HIPTENSOR_COMPUTE_NONE = 0 } hiptensorComputeType_t; diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 9b0cdf9f..b2e54d80 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -81,10 +81,23 @@ namespace hiptensor * ``` * Hence, the `alpha` and `bete` need to point to a ComputeData value */ - double alpha = 0.0; - double beta = 0.0; - writeVal(&alpha, computeType, 1.02); - writeVal(&beta, computeType, 1.03); + hipDoubleComplex alpha; + hipDoubleComplex beta; + if(computeType == HIPTENSOR_COMPUTE_C32F) + { + writeVal(&alpha, computeType, hipFloatComplex{1.02, 1.03}); + writeVal(&beta, computeType, hipFloatComplex{1.04, 1.05}); + } + else if(computeType == HIPTENSOR_COMPUTE_C64F) + { + writeVal(&alpha, computeType, hipDoubleComplex{1.02, 1.03}); + writeVal(&beta, computeType, hipDoubleComplex{1.04, 1.05}); + } + else + { + writeVal(&alpha, computeType, 1.02); + writeVal(&beta, computeType, 1.03); + } CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB)); diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 263937c3..33b6a85e 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -90,17 +90,17 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; - auto betaF = 0.0f; + ScalarData alphaF; + ScalarData betaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal( + alphaF = hiptensor::readVal( alpha, convertToComputeType(HipDataType_v)); } if(beta != nullptr) { - betaF = hiptensor::readVal( + betaF = hiptensor::readVal( beta, convertToComputeType(HipDataType_v)); } @@ -125,7 +125,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF, betaF})); + typename Traits::CDEOp(alphaF, betaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); @@ -203,11 +203,11 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; + ScalarData alphaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal( + alphaF = hiptensor::readVal( alpha, convertToComputeType(HipDataType_v)); } @@ -232,7 +232,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF})); + typename Traits::CDEOp(alphaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index 09df158f..abaf7154 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -110,11 +110,11 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_16F; } - else if(hipType == HIP_R_32F || hipType == HIP_C_32F) + else if(hipType == HIP_R_32F) { return HIPTENSOR_COMPUTE_32F; } - else if(hipType == HIP_R_64F || hipType == HIP_C_64F) + else if(hipType == 
HIP_R_64F) { return HIPTENSOR_COMPUTE_64F; } @@ -134,12 +134,72 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_32U; } + else if(hipType == HIP_C_32F) + { + return HIPTENSOR_COMPUTE_C32F; + } + else if(hipType == HIP_C_64F) + { + return HIPTENSOR_COMPUTE_C64F; + } else { return HIPTENSOR_COMPUTE_NONE; } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + return ScalarData(*(_Float16*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + return ScalarData(*(hip_bfloat16*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + return ScalarData(*(float*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + return ScalarData(*(double*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + return ScalarData(*(uint8_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + return ScalarData(*(int8_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + return ScalarData(*(uint32_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + return ScalarData(*(int32_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + return {*(hipFloatComplex*)value, id}; + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + return {*(hipDoubleComplex*)value, id}; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return {0, HIPTENSOR_COMPUTE_NONE}; + } + } + void writeVal(void const* addr, hiptensorComputeType_t id, double value) { if(id == HIPTENSOR_COMPUTE_16F) @@ -183,6 +243,40 @@ namespace hiptensor } } + void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value) + { + if(id == HIPTENSOR_COMPUTE_C32F) + { + *(hipFloatComplex*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Data type is hipFloatComplex, but hiptensorComputeType_t is not " + "HIPTENSOR_COMPUTE_C32F: " + << id << std::endl; +#endif // !NDEBUG + return; + } + } + + void writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value) + { + if(id == HIPTENSOR_COMPUTE_C64F) + { + *(hipDoubleComplex*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Data type is hipDoubleComplex, but hiptensorComputeType_t is not " + "HIPTENSOR_COMPUTE_C64F: " + << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 97402fa3..aa2eaa40 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -44,6 +44,49 @@ namespace hiptensor // Used to map to empty tensors struct NoneType; + struct ScalarData + { + hiptensorComputeType_t type; + union + { + double real; + hipDoubleComplex complex; + }; + + ScalarData() = default; + ScalarData(double value, hiptensorComputeType_t type) + : real(value) + , type(type) + { + } + ScalarData(hipFloatComplex value, hiptensorComputeType_t type) + : complex(hipComplexFloatToDouble(value)) + , type(type) + { + } + ScalarData(hipDoubleComplex value, hiptensorComputeType_t type) + : complex(value) + , type(type) + { + } + operator float() const + { + return static_cast(real); + } + operator double() const + { + return real; + } + operator hipFloatComplex() const + { + return hipComplexDoubleToFloat(complex); + } + operator hipDoubleComplex() const + { + return complex; + } + }; + static constexpr hipDataType NONE_TYPE = (hipDataType)31; // Map type to runtime HipDataType 
@@ -67,6 +110,8 @@ namespace hiptensor T readVal(void const* value, hiptensorComputeType_t id); void writeVal(void const* addr, hiptensorComputeType_t id, double value); + void writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value); + void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value); } // namespace hiptensor diff --git a/library/src/include/data_types_impl.hpp b/library/src/include/data_types_impl.hpp index ef3e7c77..c55f0d7e 100644 --- a/library/src/include/data_types_impl.hpp +++ b/library/src/include/data_types_impl.hpp @@ -174,11 +174,11 @@ namespace hiptensor { return static_cast(*(uint64_t*)value); } - else if constexpr(std::is_same_v && id == HIP_C_32F) + else if constexpr(std::is_same_v && id == HIP_C_32F) { return static_cast(*(hipFloatComplex*)value); } - else if constexpr(std::is_same_v && id == HIP_C_64F) + else if constexpr(std::is_same_v && id == HIP_C_64F) { return static_cast(*(hipDoubleComplex*)value); } @@ -235,6 +235,8 @@ namespace hiptensor } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id); } // namespace hiptensor #endif // HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP diff --git a/test/01_contraction/CMakeLists.txt b/test/01_contraction/CMakeLists.txt index fe2d7a87..1e0e3c0a 100644 --- a/test/01_contraction/CMakeLists.txt +++ b/test/01_contraction/CMakeLists.txt @@ -33,6 +33,12 @@ set (BilinearContractionTestSources ${ContractionCommonSources} set (BilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/bilinear_test_params.yaml) add_hiptensor_test(bilinear_contraction_test ${BilinearContractionTestConfig} ${BilinearContractionTestSources}) +# Complex Bilinear tests +set (ComplexBilinearContractionTestSources ${ContractionCommonSources} + ${CMAKE_CURRENT_SOURCE_DIR}/complex_bilinear_contraction_test.cpp) +set (ComplexBilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/complex_bilinear_test_params.yaml) +add_hiptensor_test(complex_bilinear_contraction_test ${ComplexBilinearContractionTestConfig} ${ComplexBilinearContractionTestSources}) + # Scale tests set (ScaleContractionTestSources ${ContractionCommonSources} ${CMAKE_CURRENT_SOURCE_DIR}/scale_contraction_test.cpp) diff --git a/test/01_contraction/complex_bilinear_contraction_test.cpp b/test/01_contraction/complex_bilinear_contraction_test.cpp new file mode 100644 index 00000000..51e95c34 --- /dev/null +++ b/test/01_contraction/complex_bilinear_contraction_test.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexBilinearContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexBilinearContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexBilinearContractionTest, load_config_helper()); diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml new file mode 100644 index 00000000..cbaee86a --- /dev/null +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -0,0 +1,37 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + # - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY +Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - 0 + - 1 + - 1 +Betas: + - 2 + - 0 + - 2 +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 2 ] +Strides: + - [] +... 
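For reference, the operation exercised by the complex bilinear tests added above
is D = alpha * contraction(A, B) + beta * C with complex-valued alpha and beta.
Below is a minimal host-side sketch of the per-element epilogue arithmetic,
assuming only the helpers from <hip/hip_complex.h>; the function name is
illustrative, not a library API. The packing kernels introduced in the
following patches perform the same arithmetic on the GPU.

    #include <hip/hip_complex.h>

    // e = alpha * accum + beta * d for the f32 complex path; alpha and beta
    // are carried in double precision and narrowed at the point of use.
    static hipFloatComplex bilinearEpilogueF32(hipFloatComplex accum,
                                               hipFloatComplex d,
                                               hipDoubleComplex alpha,
                                               hipDoubleComplex beta)
    {
        return hipCaddf(hipCmulf(hipComplexDoubleToFloat(alpha), accum),
                        hipCmulf(hipComplexDoubleToFloat(beta), d));
    }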
From 1b2031ecfdfb6e614c081853d86c3b070a21fbf3 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Thu, 21 Dec 2023 13:40:44 -0500 Subject: [PATCH 36/42] Modify base class compute type --- .../device/device_contraction_bilinear_complex.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index c7a71263..fc050e50 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -165,7 +165,7 @@ namespace ck AElementwiseOperation, BElementwiseOperation, Bilinear, - ComputeDataType> + HIP_vector_type> { // Complex device Op using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; @@ -243,7 +243,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + DecompCompute, LoopSched>; // Argument From 41c44b26d3d9e4cfb17402418f4a88bba6338fef Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Thu, 21 Dec 2023 14:13:12 -0500 Subject: [PATCH 37/42] Modify cpu and gpu device instances for complex --- .../contraction_cpu_reference_impl.hpp | 38 +++++++++- .../contraction_cpu_reference_instances.cpp | 64 +--------------- .../contraction/contraction_meta_traits.hpp | 24 ++++-- .../src/contraction/contraction_pack_util.hpp | 31 +++++--- .../src/contraction/contraction_solution.hpp | 2 + .../contraction/contraction_solution_impl.hpp | 12 ++- .../contraction_solution_instances.cpp | 16 ++-- library/src/contraction/contraction_types.hpp | 2 + .../contraction/contraction_types_impl.hpp | 13 ++++ library/src/contraction/device/common.hpp | 2 + .../device_contraction_bilinear_complex.hpp | 30 ++++---- ...2_cf32_cf32_compute_cf32_kknn_instance.cpp | 11 +-- ...2_cf32_cf32_compute_cf32_knnn_instance.cpp | 11 +-- ...2_cf32_cf32_compute_cf32_mknn_instance.cpp | 11 +-- ...2_cf32_cf32_compute_cf32_mnnn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_kknn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_knnn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_mknn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_mnnn_instance.cpp | 11 +-- .../device_contraction_scale_complex.hpp | 30 ++++---- ...32_cf32_cf32_compute_cf32_kkn_instance.cpp | 11 +-- ...32_cf32_cf32_compute_cf32_knn_instance.cpp | 11 +-- ...32_cf32_cf32_compute_cf32_mkn_instance.cpp | 11 +-- ...32_cf32_cf32_compute_cf32_mnn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_kkn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_knn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_mkn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_mnn_instance.cpp | 11 +-- ...device_element_wise_operation_complex.hpp} | 17 +---- ...ptensor_contraction_bilinear_instances.hpp | 76 ++++++++++--------- .../hiptensor_contraction_scale_instances.hpp | 74 +++++++++--------- library/src/include/meta_traits.hpp | 2 +- 32 files changed, 323 insertions(+), 286 deletions(-) rename library/src/contraction/device/{device_element_wise_complex.hpp => device_element_wise_operation_complex.hpp} (87%) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 25c317e3..2f031bb0 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -200,7 
+200,20 @@ namespace hiptensor { ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.scale_ * (EDataType)accum; } - else // bilinear + else if constexpr(std::is_same_v) + { + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCmulf(hipComplexDoubleToFloat(arg.mOpCDE.scale_), (EDataType)accum); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCmul(arg.mOpCDE.scale_, (EDataType)accum); + } + } + else if constexpr(std::is_same_v) { // NumDTensor will be 1 due to SFINAE of this class auto indexD @@ -209,6 +222,29 @@ namespace hiptensor ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.alpha_ * (EDataType)accum + arg.mOpCDE.beta_ * ((EDataType*)(arg.mD[0]))[indexD]; } + else if constexpr(std::is_same_v) + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCaddf( + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.alpha_), + (EDataType)accum), + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.beta_), + ((EDataType*)(arg.mD[0]))[indexD])); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCadd(hipCmul(arg.mOpCDE.alpha_, (EDataType)accum), + hipCmul(arg.mOpCDE.beta_, ((EDataType*)(arg.mD[0]))[indexD])); + } + } }; make_ParallelTensorFunctor(f_ms_ns_complex, diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index d2fd77fa..60c1ce49 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -117,21 +117,7 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - float>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipFloatComplex, - hipFloatComplex, - float, - ck::Tuple, - hipFloatComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, + ck::tensor_operation::element_wise::BilinearComplex, hipFloatComplex>()); // Bilinear f64 @@ -175,21 +161,7 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - double>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipDoubleComplex, - hipDoubleComplex, - double, - ck::Tuple, - hipDoubleComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, + ck::tensor_operation::element_wise::BilinearComplex, hipDoubleComplex>()); // Scale f16 @@ -277,21 +249,7 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - float>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipFloatComplex, - hipFloatComplex, - float, - ck::Tuple<>, - hipFloatComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, + ck::tensor_operation::element_wise::ScaleComplex, hipFloatComplex>()); // Scale f64 @@ -335,21 +293,7 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, 
ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - double>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipDoubleComplex, - hipDoubleComplex, - double, - ck::Tuple<>, - hipDoubleComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, + ck::tensor_operation::element_wise::ScaleComplex, hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index e66ac432..48508c6e 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -34,12 +34,12 @@ #include // hiptensor includes +#include "device/device_element_wise_operation_complex.hpp" #include "data_types.hpp" #include "meta_traits.hpp" namespace hiptensor { - // Partial specialize for Bilinear contraction template struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>, + std::enable_if_t<(std::is_same_v) || + (std::is_same_v)>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; @@ -88,7 +93,7 @@ namespace hiptensor ComputeDataType>; using AOp = AElementwiseOperation; using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using CDEOp = CDEElementwiseOperation; }; // Partial specialize for Scale contraction @@ -100,6 +105,7 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, + typename CDEElementwiseOperation, typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>, + std::enable_if_t<(std::is_same_v) || + (std::is_same_v)>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; @@ -129,7 +139,7 @@ namespace hiptensor ComputeDataType>; using AOp = AElementwiseOperation; using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using CDEOp = CDEElementwiseOperation; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index bcc99398..237e9d7f 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -39,8 +39,8 @@ namespace hiptensor */ template __global__ void mfma(DataType* mE_real, DataType* mE_imag, DataType* mD_real, DataType* mD_imag, - HIP_vector_type *mE_grid, HIP_vector_type alpha, - HIP_vector_type beta, int length) + HIP_vector_type *mE_grid, HIP_vector_type alpha, + HIP_vector_type beta, int length) { int idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -48,13 +48,22 @@ namespace hiptensor { if constexpr(std::is_same_v) { - mE_grid[idx] = hipCaddf(hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha), - hipCmulf(make_hipFloatComplex(mD_real[idx], mD_imag[idx]), beta)); + mE_grid[idx] = hipCaddf( + hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)), + hipCmulf( + make_hipFloatComplex(mD_real[idx], mD_imag[idx]), + hipComplexDoubleToFloat(beta))); } else if constexpr(std::is_same_v) { - mE_grid[idx] = hipCadd(hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha), - hipCmul(make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), beta)); + mE_grid[idx] = hipCadd(hipCmul( + make_hipDoubleComplex(mE_real[idx], 
mE_imag[idx]), + alpha), + hipCmul( + make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), + beta)); } } } @@ -65,7 +74,7 @@ namespace hiptensor */ template __global__ void multiply(DataType* mE_real, DataType* mE_imag, HIP_vector_type *mE_grid, - HIP_vector_type alpha, int length) + HIP_vector_type alpha, int length) { int idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -73,11 +82,15 @@ namespace hiptensor { if constexpr(std::is_same_v) { - mE_grid[idx] = hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha); + mE_grid[idx] = hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)); } else if constexpr(std::is_same_v) { - mE_grid[idx] = hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha); + mE_grid[idx] = hipCmul( + make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), + alpha); } } } diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index e76bb351..97dde1ca 100644 --- a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -38,6 +38,8 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" + #include "contraction_meta_traits.hpp" #include "contraction_solution_params.hpp" #include "performance.hpp" diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 33b6a85e..09e300a7 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -52,8 +52,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Bilinear>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Bilinear>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::BilinearComplex>)>> : public ContractionSolution { public: @@ -165,8 +167,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Scale>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Scale>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::ScaleComplex>)>> : public ContractionSolution { public: diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 2cec41bc..ad5b4408 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -116,8 +116,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - float>()); + ck::tensor_operation::element_wise::BilinearComplex, + hipFloatComplex>()); // Bilinear f64 registerSolutions( @@ -156,8 +156,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - double>()); + ck::tensor_operation::element_wise::BilinearComplex, + hipDoubleComplex>()); // Scale bf16 registerSolutions( @@ -238,8 +238,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - float>()); + 
ck::tensor_operation::element_wise::ScaleComplex, + hipFloatComplex>()); // Scale f64 registerSolutions( @@ -278,8 +278,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - double>()); + ck::tensor_operation::element_wise::ScaleComplex, + hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_types.hpp b/library/src/contraction/contraction_types.hpp index 101d72dc..e4930726 100644 --- a/library/src/contraction/contraction_types.hpp +++ b/library/src/contraction/contraction_types.hpp @@ -40,6 +40,8 @@ namespace hiptensor { SCALE = 0, ///< \f${C=\alpha\mathcal{A}\mathcal{B}}\f$ BILINEAR = 1, ///< \f${D=\alpha\mathcal{A}\mathcal{B}+\beta\mathcal{C}}\f$ + SCALE_COMPLEX = 2, + BILINEAR_COMPLEX = 3, UNKNOWN, }; diff --git a/library/src/contraction/contraction_types_impl.hpp b/library/src/contraction/contraction_types_impl.hpp index d8fa0f74..070718cc 100644 --- a/library/src/contraction/contraction_types_impl.hpp +++ b/library/src/contraction/contraction_types_impl.hpp @@ -32,6 +32,7 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" #include "contraction_types.hpp" #include @@ -51,12 +52,24 @@ namespace hiptensor static constexpr auto value = ContractionOpId_t::SCALE; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::SCALE_COMPLEX; + }; + template <> struct ContractionOperatorType { static constexpr auto value = ContractionOpId_t::BILINEAR; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::BILINEAR_COMPLEX; + }; + } // namespace hiptensor #endif // HIPTENSOR_CONTRACTION_TYPES_IMPL_HPP diff --git a/library/src/contraction/device/common.hpp b/library/src/contraction/device/common.hpp index f530b2e2..efd4866c 100644 --- a/library/src/contraction/device/common.hpp +++ b/library/src/contraction/device/common.hpp @@ -39,4 +39,6 @@ #include #include +#include "device_element_wise_operation_complex.hpp" + #endif // CONTRACTION_DEVICE_COMMON_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index fc050e50..7fc09504 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -43,7 +43,8 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; // The following is a specialization class for bilinear contractions of complex types. 
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -121,7 +122,7 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Bilinear, + BilinearComplex, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -164,12 +165,13 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Bilinear, + BilinearComplex, HIP_vector_type> { // Complex device Op - using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; - using CDEElementwiseOperation = Bilinear; + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + using CDEElementwiseOperation = BilinearComplex; + using DecompCDEElementwiseOperation = Bilinear; // Complex types given through the interface using ComplexA = HIP_vector_type; @@ -212,7 +214,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation, + DecompCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -285,7 +287,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) + CDEElementwiseOperation cde_element_op) : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. @@ -299,8 +301,6 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); - element_op = cde_element_op; - mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -371,16 +371,16 @@ namespace ck cde_element_op); }; - mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, DecompCDEElementwiseOperation{1.0f, 1.0f}); mArgs[1] = allocArgs(mE_real, mA_imag, mB_imag, mE_real, - CDEElementwiseOperation{-1.0f, + DecompCDEElementwiseOperation{-1.0f, 1.0f}); - mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, DecompCDEElementwiseOperation{1.0f, 1.0f}); mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - CDEElementwiseOperation{1.0f , 1.0f}); + DecompCDEElementwiseOperation{1.0f , 1.0f}); // original /* TODO :Uncomment once done @@ -425,7 +425,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; - CDEElementwiseOperation element_op{1.0f, 1.0f}; + CDEElementwiseOperation element_op; void* mE_grid; index_t elementsE; }; @@ -469,7 +469,7 @@ namespace ck hiptensor::mfma<<>>( arg.mE_real.get(), arg.mE_imag.get(), arg.mD_real.get(), arg.mD_imag.get(), ((ComplexE*)arg.mE_grid), arg.element_op.alpha_, arg.element_op.beta_, - arg.elementsE); + arg.elementsE); //hiptensor::pack<<>>( // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp index 02e3834e..4601021e 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp @@ -47,9 +47,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; 
+ using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -63,7 +64,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( @@ -76,7 +77,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp index 742d49a6..e3f60146 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp index 0f6b19d1..c2fd7c84 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp index 184aea57..8203a4e5 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp index 5be10230..9d779671 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp @@ -47,9 +47,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -63,7 +64,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( @@ -76,7 +77,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp index bf5c1667..4197dda2 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are 
the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp index e07e603e..cc519368 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp index 3329307a..ff187398 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 43a9358c..47b84e2c 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -44,7 +44,9 @@ namespace ck using hiptensor::elementSpaceFromLengthsAndStrides; using Bilinear = 
ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; using Scale = ck::tensor_operation::element_wise::Scale; + using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; // The following is a specialization class for bilinear contractions of complex types. // For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -121,7 +123,7 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Scale, + ScaleComplex, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -164,15 +166,17 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Scale, + ScaleComplex, HIP_vector_type> { // Complex device Op using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; // CDE Operations - using ScaleCDEElementwiseOperation = Scale; - using BilinearCDEElementwiseOperation = Bilinear; + using ScaleCDEElementwiseOperation = ScaleComplex; + using DecompScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = BilinearComplex; + using DecompBilinearCDEElementwiseOperation = Bilinear; // Complex types given through the interface using ComplexA = HIP_vector_type; @@ -214,7 +218,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - ScaleCDEElementwiseOperation, + DecompScaleCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -262,7 +266,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - BilinearCDEElementwiseOperation, + DecompBilinearCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -336,7 +340,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - ScaleCDEElementwiseOperation cde_element_op) + ScaleCDEElementwiseOperation cde_element_op) : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. 
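// With A = Ar + i*Ai and B = Br + i*Bi, the sub-contraction arguments
// allocated below compute the complex product in real arithmetic:
//     Re(E) = Ar*Br - Ai*Bi   (mScaleArgs[0], then mBilinearArgs[0] with {-1.0f, 1.0f})
//     Im(E) = Ar*Bi + Ai*Br   (mScaleArgs[1], then mBilinearArgs[1] with { 1.0f, 1.0f})
// The complex alpha held in element_op is applied afterwards by the
// multiply() kernel from contraction_pack_util.hpp.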
@@ -348,8 +352,6 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); - element_op = cde_element_op; - mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -449,21 +451,21 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, ScaleCDEElementwiseOperation{1.0f}); + mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); mBilinearArgs[0] = allocBilinearArgs( mE_real, mA_imag, mB_imag, mE_real, - BilinearCDEElementwiseOperation{-1.0f, 1.0f}); + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, ScaleCDEElementwiseOperation{1.0f}); + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); mBilinearArgs[1] = allocBilinearArgs( mE_imag, mA_imag, mB_real, mE_imag, - BilinearCDEElementwiseOperation{1.0f, 1.0f}); + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); // TODO UNCOMMENT WHEN DONE @@ -512,7 +514,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; - ScaleCDEElementwiseOperation element_op{1.0}; + ScaleCDEElementwiseOperation element_op; void* mE_grid; index_t elementsE; }; diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp index 9e9c8f9a..3133f4cd 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp @@ -48,9 +48,10 @@ namespace ck namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -64,7 +65,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( std::vector>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp index b9183a21..b358be8a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void 
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale, + ScaleComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp index 1f87031d..359a074a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale, + ScaleComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp index ef7724e0..4cc8659d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale, + ScaleComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp index e22aab5f..1cac8ebb 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp @@ -48,9 +48,10 @@ namespace ck namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 
= double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -64,7 +65,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( @@ -77,7 +78,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp index 58ed790a..e60bbd61 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp index 562519f5..e44d24e1 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp index 724d89cf..dee9ce39 100644 --- 
a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_element_wise_complex.hpp b/library/src/contraction/device/device_element_wise_operation_complex.hpp similarity index 87% rename from library/src/contraction/device/device_element_wise_complex.hpp rename to library/src/contraction/device/device_element_wise_operation_complex.hpp index 6dfd94e0..a01ced36 100644 --- a/library/src/contraction/device/device_element_wise_complex.hpp +++ b/library/src/contraction/device/device_element_wise_operation_complex.hpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#ifndef HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP -#define HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP +#ifndef HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP +#define HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP #include #include @@ -37,11 +37,6 @@ namespace element_wise { struct ScaleComplex : public Scale { - __host__ __device__ ScaleComplex(hipFloatComplex scale) : Scale(hipCrealf(scale)) - { - scale_ = hipComplexFloatToDouble(scale); - } - __host__ __device__ ScaleComplex(hipDoubleComplex scale) : Scale(hipCreal(scale)) { scale_ = scale; @@ -68,12 +63,6 @@ struct ScaleComplex : public Scale struct BilinearComplex : public Bilinear { - BilinearComplex(hipFloatComplex alpha, hipFloatComplex beta) : Bilinear(hipCrealf(alpha), hipCrealf(beta)) - { - alpha_ = hipComplexFloatToDouble(alpha); - beta_ = hipComplexFloatToDouble(beta); - } - BilinearComplex(hipDoubleComplex alpha, hipDoubleComplex beta) : Bilinear(hipCreal(alpha), hipCreal(beta)) { alpha_ = alpha; @@ -105,4 +94,4 @@ struct BilinearComplex : public Bilinear } // namespace tensor_operation } // namespace ck -#endif // HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP +#endif // HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP diff --git a/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp index eac0f117..81d7edf5 100644 --- a/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp +++ b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp @@ -44,9 +44,11 @@ namespace ck using F64 = double; using CF64 = hipDoubleComplex; using CF64_Tuple = ck::Tuple; - + + using BilinearComplex = element_wise::BilinearComplex; + void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( std::vector>>& 
instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( std::vector>>& instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( std::vector>>& instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( std::vector>>& instances); + BilinearComplex, + CF32>>>& instances); // double void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); // Contraction + Bilinear template , ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - ComputeDataT>> + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, @@ -191,8 +193,8 @@ namespace ck HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - ComputeDataT>; + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>; static auto GetInstances() { @@ -203,13 +205,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( op_ptrs); } } @@ -220,13 
+222,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( op_ptrs); } } diff --git a/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp index fff9dca6..705ac6c0 100644 --- a/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp +++ b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp @@ -44,8 +44,10 @@ namespace ck using F64 = double; using CF64 = hipDoubleComplex; + using ScaleComplex = element_wise::ScaleComplex; + void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); // Contraction + Scale template , ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - 
ck::tensor_operation::element_wise::Scale, - ComputeDataType>> + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, @@ -188,8 +190,8 @@ namespace ck HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - ComputeDataType>; + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>; static auto GetInstances() { @@ -200,13 +202,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( op_ptrs); } } @@ -216,13 +218,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( op_ptrs); } } diff --git a/library/src/include/meta_traits.hpp b/library/src/include/meta_traits.hpp index 0e039cd6..2cd0d740 100644 --- a/library/src/include/meta_traits.hpp +++ b/library/src/include/meta_traits.hpp @@ -32,7 +32,7 @@ namespace hiptensor // Placeholder for building traits on any type T // Use partial or full specialization for any class. 
- template + template struct MetaTraits; } // namespace hiptensor From cf233d82a08ebb0618bf8ca178388d754a87acf6 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 21 Dec 2023 22:45:45 +0000 Subject: [PATCH 38/42] Add unit test of contraction with complex compute type - Fix issue in yaml_test - Fix a bug in hiptensorInitContractionDescriptor --- .../src/contraction/contraction_selection.cpp | 19 ++--- .../src/contraction/hiptensor_contraction.cpp | 12 ++- library/src/data_types.cpp | 79 +++++++------------ library/src/include/data_types.hpp | 42 +++++----- test/00_unit/yaml_test.cpp | 8 +- .../configs/bilinear_test_params.yaml | 14 ++-- .../configs/complex_bilinear_test_params.yaml | 23 ++---- .../configs/scale_test_params.yaml | 14 ++-- test/01_contraction/contraction_test.cpp | 56 +++++++------ .../contraction_test_params.hpp | 4 +- test/llvm/yaml_parser_config.cpp | 14 +++- 11 files changed, 133 insertions(+), 152 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index b2e54d80..1f7b70a6 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -81,22 +81,17 @@ namespace hiptensor * ``` * Hence, the `alpha` and `bete` need to point to a ComputeData value */ - hipDoubleComplex alpha; - hipDoubleComplex beta; - if(computeType == HIPTENSOR_COMPUTE_C32F) + ScalarData alpha; + ScalarData beta; + if(computeType == HIPTENSOR_COMPUTE_C32F || computeType == HIPTENSOR_COMPUTE_C64F) { - writeVal(&alpha, computeType, hipFloatComplex{1.02, 1.03}); - writeVal(&beta, computeType, hipFloatComplex{1.04, 1.05}); - } - else if(computeType == HIPTENSOR_COMPUTE_C64F) - { - writeVal(&alpha, computeType, hipDoubleComplex{1.02, 1.03}); - writeVal(&beta, computeType, hipDoubleComplex{1.04, 1.05}); + writeVal(&alpha, computeType, {computeType, 1.02, 1.03}); + writeVal(&beta, computeType, {computeType, 1.04, 1.05}); } else { - writeVal(&alpha, computeType, 1.02); - writeVal(&beta, computeType, 1.03); + writeVal(&alpha, computeType, ScalarData(computeType, 1.02)); + writeVal(&beta, computeType, ScalarData(computeType, 1.03)); } CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 8148eeaa..d063ebf5 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -147,7 +147,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* // Use a scale contraction due to // tensor C-descriptor is empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::SCALE, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? hiptensor::ContractionOpId_t::SCALE_COMPLEX + : hiptensor::ContractionOpId_t::SCALE; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, @@ -161,7 +165,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* { // Use a bilinear contraction due to // tensor C-descriptor is not empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::BILINEAR, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? 
hiptensor::ContractionOpId_t::BILINEAR_COMPLEX + : hiptensor::ContractionOpId_t::BILINEAR; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, *descC, *descD}, {alignmentRequirementA, diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index abaf7154..69e29b50 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -153,130 +153,105 @@ namespace hiptensor { if(id == HIPTENSOR_COMPUTE_16F) { - return ScalarData(*(_Float16*)value, id); + return ScalarData(id, *(_Float16*)value); } else if(id == HIPTENSOR_COMPUTE_16BF) { - return ScalarData(*(hip_bfloat16*)value, id); + return ScalarData(id, *(hip_bfloat16*)value); } else if(id == HIPTENSOR_COMPUTE_32F) { - return ScalarData(*(float*)value, id); + return ScalarData(id, *(float*)value); } else if(id == HIPTENSOR_COMPUTE_64F) { - return ScalarData(*(double*)value, id); + return ScalarData(id, *(double*)value); } else if(id == HIPTENSOR_COMPUTE_8U) { - return ScalarData(*(uint8_t*)value, id); + return ScalarData(id, *(uint8_t*)value); } else if(id == HIPTENSOR_COMPUTE_8I) { - return ScalarData(*(int8_t*)value, id); + return ScalarData(id, *(int8_t*)value); } else if(id == HIPTENSOR_COMPUTE_32U) { - return ScalarData(*(uint32_t*)value, id); + return ScalarData(id, *(uint32_t*)value); } else if(id == HIPTENSOR_COMPUTE_32I) { - return ScalarData(*(int32_t*)value, id); + return ScalarData(id, *(int32_t*)value); } else if(id == HIPTENSOR_COMPUTE_C32F) { - return {*(hipFloatComplex*)value, id}; + auto complex = *(hipFloatComplex*)value; + return {id, complex.x, complex.y}; } else if(id == HIPTENSOR_COMPUTE_C64F) { - return {*(hipDoubleComplex*)value, id}; + auto complex = *(hipDoubleComplex*)value; + return {id, complex.x, complex.y}; } else { #if !NDEBUG std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; #endif // !NDEBUG - return {0, HIPTENSOR_COMPUTE_NONE}; + return {HIPTENSOR_COMPUTE_NONE, 0, 0}; } } - void writeVal(void const* addr, hiptensorComputeType_t id, double value) + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value) { if(id == HIPTENSOR_COMPUTE_16F) { - *(_Float16*)addr = value; + *(_Float16*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_16BF) { - *(hip_bfloat16*)addr = value; + *(hip_bfloat16*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_32F) { - *(float*)addr = value; + *(float*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_64F) { - *(double*)addr = value; + *(double*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_8U) { - *(uint8_t*)addr = value; + *(uint8_t*)addr = (uint8_t)value.mReal; } else if(id == HIPTENSOR_COMPUTE_8I) { - *(int8_t*)addr = value; + *(int8_t*)addr = (int8_t)value.mReal; } else if(id == HIPTENSOR_COMPUTE_32U) { - *(uint32_t*)addr = value; + *(uint32_t*)addr = (uint32_t)value.mReal; } else if(id == HIPTENSOR_COMPUTE_32I) { - *(int32_t*)addr = value; + *(int32_t*)addr = (int32_t)value.mReal; } - else - { -#if !NDEBUG - std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; -#endif // !NDEBUG - return; - } - } - - void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value) - { - if(id == HIPTENSOR_COMPUTE_C32F) - { - *(hipFloatComplex*)addr = value; - } - else + else if(id == HIPTENSOR_COMPUTE_C32F) { -#if !NDEBUG - std::cout << "Data type is hipFloatComplex, but hiptensorComputeType_t is not " - "HIPTENSOR_COMPUTE_C32F: " - << id << std::endl; -#endif // !NDEBUG - return; + *(hipFloatComplex*)addr = hipComplexDoubleToFloat(value.mComplex); } - } - - void 
writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value) - { - if(id == HIPTENSOR_COMPUTE_C64F) + else if(id == HIPTENSOR_COMPUTE_C64F) { - *(hipDoubleComplex*)addr = value; + *(hipDoubleComplex*)addr = value.mComplex; } else { #if !NDEBUG - std::cout << "Data type is hipDoubleComplex, but hiptensorComputeType_t is not " - "HIPTENSOR_COMPUTE_C64F: " - << id << std::endl; + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; #endif // !NDEBUG return; } } - } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index aa2eaa40..900b2069 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -46,44 +46,41 @@ namespace hiptensor struct ScalarData { - hiptensorComputeType_t type; + hiptensorComputeType_t mType; union { - double real; - hipDoubleComplex complex; + double mReal; + hipDoubleComplex mComplex; }; ScalarData() = default; - ScalarData(double value, hiptensorComputeType_t type) - : real(value) - , type(type) - { - } - ScalarData(hipFloatComplex value, hiptensorComputeType_t type) - : complex(hipComplexFloatToDouble(value)) - , type(type) - { - } - ScalarData(hipDoubleComplex value, hiptensorComputeType_t type) - : complex(value) - , type(type) + ScalarData(hiptensorComputeType_t type, double real, double imag = 0) { + mType = type; + if(type == HIPTENSOR_COMPUTE_C32F || type == HIPTENSOR_COMPUTE_C64F) + { + mComplex = make_hipDoubleComplex(real, imag); + } + else + { + mReal = real; + } } operator float() const { - return static_cast(real); + return static_cast(mReal); } operator double() const { - return real; + return mReal; } operator hipFloatComplex() const { - return hipComplexDoubleToFloat(complex); + return hipComplexDoubleToFloat(mComplex); } operator hipDoubleComplex() const { - return complex; + return mComplex; } }; @@ -109,10 +106,7 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); - void writeVal(void const* addr, hiptensorComputeType_t id, double value); - void writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value); - void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value); - + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value); } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/test/00_unit/yaml_test.cpp b/test/00_unit/yaml_test.cpp index 372fbbdd..57a86a25 100644 --- a/test/00_unit/yaml_test.cpp +++ b/test/00_unit/yaml_test.cpp @@ -54,8 +54,8 @@ namespace hiptensor using LengthsT = std::vector; using StridesT = std::vector; - using AlphaT = double; - using BetaT = double; + using AlphaT = std::vector; + using BetaT = std::vector; //Data types of input and output tensors std::vector mDataTypes; @@ -98,8 +98,8 @@ int main(int argc, char* argv[]) yee.mProblemLengths = {{5, 6, 7, 8, 4, 2, 3, 4}, {1, 2, 3, 4}, {99, 12, 44, 31, 59, 23, 54, 22}}; yee.mProblemStrides = {{}}; - yee.mAlphas = {0, 1, 1}; - yee.mBetas = {2, 2, 2}; + yee.mAlphas = {{0}, {1}, {1}}; + yee.mBetas = {{2}, {2}, {2}}; struct TmpFileWrapper { diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index cbaee86a..1e7999fc 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -8,8 +8,6 @@ Tensor 
Data Types: - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] - - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] - - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -21,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml index cbaee86a..0d59c05d 100644 --- a/test/01_contraction/configs/complex_bilinear_test_params.yaml +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -1,15 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] - - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] - - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] - - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -21,13 +14,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0, 0] + - [1, 1] + - [1, 1] Betas: - - 2 - - 0 - - 2 + - [2, 2] + - [0, 0] + - [2, 2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 4e640034..bc8289f5 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -8,8 +8,6 @@ Tensor Data Types: - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] - - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_R_32F ] - - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -21,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 2059fd73..a75cf7bf 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -57,7 +57,8 @@ namespace hiptensor bool ContractionTest::checkDevice(hipDataType datatype) const { return (isF32Supported() - && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF || datatype == HIP_C_32F)) + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF + || datatype == 
HIP_C_32F)) || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F)); } @@ -131,7 +132,8 @@ namespace hiptensor || (DDataType == HIP_C_32F) || (DDataType == HIP_C_64F)); EXPECT_TRUE( (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) - || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)); + || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F) + || (computeType == HIPTENSOR_COMPUTE_C32F) || (computeType == HIPTENSOR_COMPUTE_C64F)); mRunFlag &= checkDevice(DDataType); @@ -297,28 +299,36 @@ namespace hiptensor else if(ADataType == HIP_C_32F && BDataType == HIP_C_32F && DDataType == HIP_C_32F) { // Initialize matrix data on device - fillLaunchKernel((hipFloatComplex*)resource->deviceA().get(), elementsA); - fillLaunchKernel((hipFloatComplex*)resource->deviceB().get(), elementsB); + fillLaunchKernel((hipFloatComplex*)resource->deviceA().get(), + elementsA); + fillLaunchKernel((hipFloatComplex*)resource->deviceB().get(), + elementsB); if(CDataType == HIP_C_32F) { - fillLaunchKernel((hipFloatComplex*)resource->deviceC().get(), elementsCD); + fillLaunchKernel((hipFloatComplex*)resource->deviceC().get(), + elementsCD); } - fillValLaunchKernel((hipFloatComplex*)resource->deviceD().get(), - elementsCD, - std::numeric_limits::signaling_NaN()); + fillValLaunchKernel( + (hipFloatComplex*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); } - else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F) + else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F) { // Initialize matrix data on device - fillLaunchKernel((hipDoubleComplex*)resource->deviceA().get(), elementsA); - fillLaunchKernel((hipDoubleComplex*)resource->deviceB().get(), elementsB); + fillLaunchKernel((hipDoubleComplex*)resource->deviceA().get(), + elementsA); + fillLaunchKernel((hipDoubleComplex*)resource->deviceB().get(), + elementsB); if(CDataType == HIP_C_64F) { - fillLaunchKernel((hipDoubleComplex*)resource->deviceC().get(), elementsCD); + fillLaunchKernel((hipDoubleComplex*)resource->deviceC().get(), + elementsCD); } - fillValLaunchKernel((hipDoubleComplex*)resource->deviceD().get(), - elementsCD, - std::numeric_limits::signaling_NaN()); + fillValLaunchKernel( + (hipDoubleComplex*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); } resource->copyDeviceToHostAll(elementBytes); @@ -515,7 +525,8 @@ namespace hiptensor stream << std::endl; stream << "Tensor D elements:\n"; - hiptensorPrintArrayElements(stream, (hipFloatComplex*)D.get(), elementsCD); + hiptensorPrintArrayElements( + stream, (hipFloatComplex*)D.get(), elementsCD); stream << std::endl; } else if(DDataType == HIP_C_64F) @@ -536,7 +547,8 @@ namespace hiptensor stream << std::endl; stream << "Tensor D elements:\n"; - hiptensorPrintArrayElements(stream, (hipDoubleComplex*)D.get(), elementsCD); + hiptensorPrintArrayElements( + stream, (hipDoubleComplex*)D.get(), elementsCD); stream << std::endl; } } @@ -573,10 +585,10 @@ namespace hiptensor * ``` * Hence, the `alpha` and `bete` need to point to a ComputeData value */ - double alphaBuf = 0.; - double betaBuf = 0.; - writeVal(&alphaBuf, computeType, alpha); - writeVal(&betaBuf, computeType, beta); + ScalarData alphaBuf; + ScalarData betaBuf; + writeVal(&alphaBuf, computeType, ScalarData(computeType, alpha[0], alpha[1])); + writeVal(&betaBuf, computeType, ScalarData(computeType, beta[0], beta[1])); 
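For readers following the scalar plumbing above: `ScalarData` is the tagged scalar introduced by this patch's data_types hunks, and `writeVal()` narrows it to the in-memory form the selected compute type expects. A condensed, self-contained sketch of the two pieces (most real-typed branches trimmed, error handling omitted; the <hiptensor/hiptensor.hpp> include for the enum is an assumption of this note, not part of the patch):

    #include <hip/hip_complex.h>
    #include <hiptensor/hiptensor.hpp> // assumed public header for hiptensorComputeType_t

    struct ScalarData
    {
        hiptensorComputeType_t mType;
        union
        {
            double           mReal;
            hipDoubleComplex mComplex;
        };

        ScalarData(hiptensorComputeType_t type, double real, double imag = 0)
            : mType(type)
        {
            if(type == HIPTENSOR_COMPUTE_C32F || type == HIPTENSOR_COMPUTE_C64F)
            {
                mComplex = make_hipDoubleComplex(real, imag);
            }
            else
            {
                mReal = real;
            }
        }
    };

    // Narrow the staged scalar to the exact bytes `id` expects at `addr`.
    void writeVal(void* addr, hiptensorComputeType_t id, ScalarData value)
    {
        if(id == HIPTENSOR_COMPUTE_32F)
        {
            *(float*)addr = static_cast<float>(value.mReal);
        }
        else if(id == HIPTENSOR_COMPUTE_C32F)
        {
            *(hipFloatComplex*)addr = hipComplexDoubleToFloat(value.mComplex);
        }
        else if(id == HIPTENSOR_COMPUTE_C64F)
        {
            *(hipDoubleComplex*)addr = value.mComplex;
        }
        // ... remaining real compute types follow the 32F pattern ...
    }

Staging everything in double / hipDoubleComplex precision lets one struct carry any supported compute type losslessly until the final narrowing store.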
CHECK_HIPTENSOR_ERROR( hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); @@ -643,7 +655,7 @@ namespace hiptensor std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); } - else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F) + else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD); diff --git a/test/01_contraction/contraction_test_params.hpp b/test/01_contraction/contraction_test_params.hpp index 29c4aa1b..4db4ebc1 100644 --- a/test/01_contraction/contraction_test_params.hpp +++ b/test/01_contraction/contraction_test_params.hpp @@ -49,8 +49,8 @@ namespace hiptensor using LengthsT = std::vector; using StridesT = std::vector; - using AlphaT = double; - using BetaT = double; + using AlphaT = std::vector; + using BetaT = std::vector; public: std::vector& dataTypes() diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 5c674045..8b504b01 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -92,6 +92,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorOperator_t) LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorWorksizePreference_t) LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) +LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) LLVM_YAML_IS_SEQUENCE_VECTOR(AlphaT) LLVM_YAML_IS_SEQUENCE_VECTOR(BetaT) @@ -229,10 +230,10 @@ namespace llvm io.mapRequired("Algorithm Types", doc.algorithms()); io.mapRequired("Operators", doc.operators()); io.mapRequired("Worksize Prefs", doc.workSizePrefrences()); - io.mapRequired("Alphas", (std::vector&)(doc.alphas())); + io.mapOptional("Alphas", (std::vector>&)(doc.alphas())); io.mapOptional("Betas", - (std::vector&)(doc.betas()), - std::vector(doc.alphas().size(), BetaT(0))); + (std::vector>&)(doc.betas()), + std::vector>(doc.alphas().size())); io.mapRequired("Lengths", doc.problemLengths()); // Default values for optional values @@ -259,6 +260,13 @@ namespace llvm return "Error: Empty Alphas"; } + if(std::any_of(doc.alphas().cbegin(), doc.alphas().cend(), [](auto&& alpha) { + return alpha.size() > 2 || alpha.size() <= 0; + })) + { + return "Error: invalid Alpha"; + } + if(doc.betas().size() > 0 && doc.betas().size() != doc.alphas().size()) { return "Error: Alphas and betas must have same size"; From c01eda7cadbfc790370440cd7413928b01af317a Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Fri, 22 Dec 2023 12:06:21 -0500 Subject: [PATCH 39/42] Modify samples to use new compute type - Fix bug in samples - Add unit tests for scale contraction --- samples/01_contraction/CMakeLists.txt | 4 +- .../simple_bilinear_contraction.hpp | 15 ++++-- ...tion_cf32_cf32_cf32_cf32_compute_cf32.cpp} | 6 +-- .../simple_scale_contraction.hpp | 13 ++++- ...ntraction_cf32_cf32_cf32_compute_cf32.cpp} | 6 +-- test/01_contraction/CMakeLists.txt | 6 ++- .../complex_scale_contraction_test.cpp | 48 +++++++++++++++++++ .../configs/complex_scale_test_params.yaml | 30 ++++++++++++ 8 files changed, 114 insertions(+), 14 deletions(-) rename samples/01_contraction/{simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp => simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp} (95%) rename samples/01_contraction/{simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp => simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp} 
(95%) create mode 100644 test/01_contraction/complex_scale_contraction_test.cpp create mode 100644 test/01_contraction/configs/complex_scale_test_params.yaml diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index c51a2dbc..d255c0e4 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -31,7 +31,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) - add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) @@ -39,7 +39,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) - add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_f32 simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_cf32 simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index 27001232..b0348b91 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -37,15 +37,24 @@ template int bilinearContractionSample() { - floatTypeCompute alpha = (floatTypeCompute)1.0f; - floatTypeCompute beta = (floatTypeCompute)1.0f; + computeDataType alpha, beta; + if constexpr(std::is_same_v || std::is_same_v) + { + alpha = computeDataType(1.0, 1.0); + beta = computeDataType(1.0, 1.0); + } + else + { + alpha = (computeDataType)1.0f; + beta = (computeDataType)1.0f; + } /********************** * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp similarity index 95% rename from 
samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp index 25392592..648675f6 100644 --- a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp @@ -39,17 +39,17 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex CDataType; - typedef float floatTypeCompute; + typedef hipFloatComplex ComputeDataType; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeC = HIP_C_32F; - constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; return bilinearContractionSample int scaleContractionSample() { - floatTypeCompute alpha = (floatTypeCompute)1.0f; + computeDataType alpha; + if constexpr(std::is_same_v || std::is_same_v) + { + alpha = computeDataType(1.0, 1.0); + } + else + { + alpha = (computeDataType)1.0f; + } + /********************** * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} **********************/ diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp similarity index 95% rename from samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp rename to samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp index 7fc5c3a3..0f6eaac3 100644 --- a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp @@ -39,17 +39,17 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex DDataType; - typedef float floatTypeCompute; + typedef hipFloatComplex ComputeDataType; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeD = HIP_C_32F; - constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; return scaleContractionSample +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexScaleContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexScaleContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexScaleContractionTest, load_config_helper()); diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml new file mode 100644 index 00000000..89f9e736 --- /dev/null +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -0,0 +1,30 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_C_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + # - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY +Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - 
HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - [0, 0] + - [1, 1] + - [1, 1] +Betas: + - [2, 2] + - [0, 0] + - [2, 2] +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 2 ] +Strides: + - [] +... From 23c033fcbd20e2268bbe0f1e708cb197481f6773 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Fri, 22 Dec 2023 17:27:11 +0000 Subject: [PATCH 40/42] Support Complex alpha and beta in samples - Add complex_scale_contraction_test.cpp - Fixed bug in device_contraction_bilinear_complex.hpp --- .../device_contraction_bilinear_complex.hpp | 290 ++++++++++++------ .../device_contraction_scale_complex.hpp | 95 +++--- .../src/contraction/hiptensor_contraction.cpp | 11 +- library/src/data_types.cpp | 16 + library/src/include/data_types.hpp | 5 + .../simple_bilinear_contraction.hpp | 28 +- ...ction_bf16_bf16_bf16_bf16_compute_bf16.cpp | 5 +- ...ction_cf32_cf32_cf32_cf32_compute_cf32.cpp | 7 +- ...ontraction_f16_f16_f16_f16_compute_f16.cpp | 5 +- ...ntraction_f32_f32_f32_f32_compute_bf16.cpp | 5 +- ...ontraction_f32_f32_f32_f32_compute_f16.cpp | 5 +- ...ontraction_f32_f32_f32_f32_compute_f32.cpp | 5 +- ...ontraction_f64_f64_f64_f64_compute_f32.cpp | 5 +- ...ontraction_f64_f64_f64_f64_compute_f64.cpp | 5 +- .../simple_scale_contraction.hpp | 15 +- ...ontraction_bf16_bf16_bf16_compute_bf16.cpp | 4 +- ...ontraction_cf32_cf32_cf32_compute_cf32.cpp | 6 +- ...le_contraction_f16_f16_f16_compute_f16.cpp | 4 +- ...e_contraction_f32_f32_f32_compute_bf16.cpp | 4 +- ...le_contraction_f32_f32_f32_compute_f16.cpp | 4 +- ...le_contraction_f32_f32_f32_compute_f32.cpp | 4 +- ...le_contraction_f64_f64_f64_compute_f32.cpp | 4 +- ...le_contraction_f64_f64_f64_compute_f64.cpp | 4 +- .../configs/complex_bilinear_test_params.yaml | 4 +- .../configs/complex_scale_test_params.yaml | 4 +- test/device/common.hpp | 2 +- 26 files changed, 326 insertions(+), 220 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 7fc09504..712ff3b0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -43,8 +43,10 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; + using Scale = ck::tensor_operation::element_wise::Scale; + using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; // The following is a specialization class for bilinear contractions of complex types. 
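    // (Recall (a + bi)(c + di) = (ac - bd) + (ad + bc)i: each plane of the
    // complex result is a signed sum of two real contractions, and the
    // complex alpha/beta are applied afterwards, when the planes are
    // re-interleaved into the packed output.)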
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -169,9 +171,13 @@ namespace ck HIP_vector_type> { // Complex device Op - using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; - using CDEElementwiseOperation = BilinearComplex; - using DecompCDEElementwiseOperation = Bilinear; + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + // CDE Operations + using ScaleCDEElementwiseOperation = ScaleComplex; + using DecompScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = BilinearComplex; + using DecompBilinearCDEElementwiseOperation = Bilinear; // Complex types given through the interface using ComplexA = HIP_vector_type; @@ -202,7 +208,55 @@ namespace ck // The internal operation that we will decompose the complex operations with. // For complex will be either float or double - using DecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + DecompScaleCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + DecompCompute, + LoopSched>; + + // The internal operation that we will decompose the complex operations with. 
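    // (Two flavours of it are needed: the Scale op above writes a fresh
    // output plane without reading any prior D values, and the Bilinear op
    // below then accumulates the second partial product into that plane.)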
+ // For complex will be either float or double + using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, @@ -214,7 +268,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - DecompCDEElementwiseOperation, + DecompBilinearCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -251,13 +305,14 @@ namespace ck // Argument struct Argument : public BaseArgument { - using DecompArg = typename DecompOp::Argument; + using ScaleDecompArgument = typename ScaleDecompOp::Argument; + using BilinearDecompArgument = typename BilinearDecompOp::Argument; Argument(Argument&& other) - : mArgs({std::move(other.mArgs[0]), - std::move(other.mArgs[1]), - std::move(other.mArgs[2]), - std::move(other.mArgs[3])}) + : mScaleArgs( + {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])}) + , mBilinearArgs({std::move(other.mBilinearArgs[0]), + std::move(other.mBilinearArgs[1])}) { } @@ -265,10 +320,10 @@ namespace ck { if(this != &other) { - mArgs[0] = std::move(other.mArgs[0]); - mArgs[1] = std::move(other.mArgs[1]); - mArgs[2] = std::move(other.mArgs[2]); - mArgs[3] = std::move(other.mArgs[3]); + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); } return *this; } @@ -287,7 +342,8 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) : element_op(cde_element_op) + BilinearCDEElementwiseOperation cde_element_op) + : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. @@ -310,7 +366,7 @@ namespace ck mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_grid = p_e_grid; + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -334,36 +390,34 @@ namespace ck } }; + // Decompose the incoming data from AOS->SOA decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - auto allocArgs = [a_ms_ks_lengths, - a_ms_ks_strides, - b_ns_ks_lengths, - b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, - e_ms_ns_lengths, - e_ms_ns_strides, - a_element_op, - b_element_op](auto& out_e, - auto const& in_a, - auto const& in_b, - auto const& in_d, - auto const& cde_element_op) { - return std::make_unique( + auto allocScaleArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& cde_element_op) { + return std::make_unique( in_a.get(), in_b.get(), - std::array{in_d.get()}, + std::array{}, out_e.get(), a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, + std::array, 0>{}, + std::array, 0>{}, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -371,46 +425,88 @@ namespace ck cde_element_op); }; - mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, DecompCDEElementwiseOperation{1.0f, 1.0f}); - mArgs[1] = allocArgs(mE_real, - mA_imag, - mB_imag, - mE_real, - DecompCDEElementwiseOperation{-1.0f, - 1.0f}); - mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, 
DecompCDEElementwiseOperation{1.0f, 1.0f}); - mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - DecompCDEElementwiseOperation{1.0f , 1.0f}); + auto allocBilinearArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& in_d, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{in_d.get()}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{e_ms_ns_lengths}, + std::array, 1>{e_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); + + // TODO UNCOMMENT WHEN DONE // original - /* TODO :Uncomment once done - mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op); - mArgs[1] = allocArgs(mE_real, - mA_imag, - mB_imag, - mE_real, - CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f, - 1.0f}); - mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op); - mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f});*/ + /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); + mBilinearArgs[0] = allocBilinearArgs( + mE_real, + mA_imag, + mB_imag, + mE_real, + BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); + mBilinearArgs[1] = allocBilinearArgs( + mE_imag, + mA_imag, + mB_real, + mE_imag, + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const { - std::cout << "Args0:" << std::endl; - mArgs[0]->Print(); - std::cout << "Args1:" << std::endl; - mArgs[1]->Print(); - std::cout << "Args2:" << std::endl; - mArgs[2]->Print(); - std::cout << "Args3:" << std::endl; - mArgs[3]->Print(); + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + mBilinearArgs[1]->Print(); } // private: // Each argument set for complex: - std::unique_ptr mArgs[4]; + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; template using DeviceArray = std::unique_ptr; @@ -425,9 +521,9 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; - CDEElementwiseOperation element_op; - void* mE_grid; - index_t elementsE; + BilinearCDEElementwiseOperation element_op; + void* mE_grid; + index_t elementsE; }; // Invoker @@ -436,12 +532,14 @@ namespace ck using Argument = typename DeviceOp::Argument; Invoker() - : mInvoker(std::make_unique()) + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) { } Invoker(Invoker&& other) - : mInvoker(std::move(other.mInvoker)) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , 
mBilinearInvoker(std::move(other.mBilinearInvoker)) { } @@ -449,7 +547,8 @@ namespace ck { if(this != &other) { - mInvoker = std::move(other.mInvoker); + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); } return *this; } @@ -457,19 +556,23 @@ namespace ck float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - auto r0 = mInvoker->Run(arg.mArgs[0].get(), stream_config); - auto r1 = mInvoker->Run(arg.mArgs[1].get(), stream_config); - auto r2 = mInvoker->Run(arg.mArgs[2].get(), stream_config); - auto r3 = mInvoker->Run(arg.mArgs[3].get(), stream_config); + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); if(arg.mE_grid != nullptr) { auto blockDim = dim3(1024); - auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::mfma<<>>( - arg.mE_real.get(), arg.mE_imag.get(), arg.mD_real.get(), arg.mD_imag.get(), - ((ComplexE*)arg.mE_grid), arg.element_op.alpha_, arg.element_op.beta_, - arg.elementsE); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + hiptensor::mfma<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + arg.mD_real.get(), + arg.mD_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.alpha_, + arg.element_op.beta_, + arg.elementsE); //hiptensor::pack<<>>( // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } @@ -484,15 +587,16 @@ namespace ck return Run(*dynamic_cast(p_arg), stream_config); } - std::unique_ptr mInvoker; + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; }; static bool IsSupportedArgument(const Argument& arg) { - return DecompOp::IsSupportedArgument(*(arg.mArgs[0].get())) - && DecompOp::IsSupportedArgument(*(arg.mArgs[1].get())) - && DecompOp::IsSupportedArgument(*(arg.mArgs[2].get())) - && DecompOp::IsSupportedArgument(*(arg.mArgs[3].get())); + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); } // polymorphic @@ -510,10 +614,14 @@ namespace ck // Call the base, then fwd to each arg. 
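            // All four decomposed argument sets (two scale, two bilinear)
            // must see the same externally provided workspace.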
this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); auto* arg = dynamic_cast(p_arg); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[0].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[1].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[2].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[3].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); } static auto MakeArgument( @@ -531,7 +639,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) + BilinearCDEElementwiseOperation cde_element_op) { return Argument{p_a, p_b, @@ -571,7 +679,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) override + BilinearCDEElementwiseOperation cde_element_op) override { return std::make_unique(p_a, p_b, diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 47b84e2c..b875db3b 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -43,10 +43,10 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; - using Scale = ck::tensor_operation::element_wise::Scale; - using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; + using Scale = ck::tensor_operation::element_wise::Scale; + using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; // The following is a specialization class for bilinear contractions of complex types. 
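    // (Same plane-wise scheme as the bilinear specialization; here the
    // complex alpha is applied by a single pointwise multiply kernel when
    // the two product planes are re-interleaved into the packed output.)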
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -307,8 +307,8 @@ namespace ck using BilinearDecompArgument = typename BilinearDecompOp::Argument; Argument(Argument&& other) - : mScaleArgs({std::move(other.mScaleArgs[0]), - std::move(other.mScaleArgs[1])}) + : mScaleArgs( + {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])}) , mBilinearArgs({std::move(other.mBilinearArgs[0]), std::move(other.mBilinearArgs[1])}) { @@ -318,10 +318,10 @@ namespace ck { if(this != &other) { - mScaleArgs[0] = std::move(other.mScaleArgs[0]); - mScaleArgs[1] = std::move(other.mScaleArgs[1]); - mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); - mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); } return *this; } @@ -340,7 +340,8 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - ScaleCDEElementwiseOperation cde_element_op) : element_op(cde_element_op) + ScaleCDEElementwiseOperation cde_element_op) + : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. @@ -359,7 +360,7 @@ namespace ck mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_grid = p_e_grid; + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -392,8 +393,6 @@ namespace ck a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -410,8 +409,8 @@ namespace ck a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, + std::array, 0>{}, + std::array, 0>{}, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -423,8 +422,6 @@ namespace ck a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -451,22 +448,23 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); - mBilinearArgs[0] = allocBilinearArgs( - mE_real, - mA_imag, - mB_imag, - mE_real, - DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); - - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); - mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_imag, - mB_real, - mE_imag, - DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); - + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); // TODO UNCOMMENT WHEN DONE // original @@ -507,16 +505,16 @@ namespace ck using DeviceArray = std::unique_ptr; // Manage extra memory for AOS->SOA - DeviceArray mA_real; - DeviceArray mA_imag; - DeviceArray mB_real; - DeviceArray mB_imag; - DeviceArray mE_real; - DeviceArray mE_imag; + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mE_real; + DeviceArray mE_imag; 
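            // Retained so the invoker can read the complex scale when it
            // launches the final recombination kernel.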
ScaleCDEElementwiseOperation element_op; - void* mE_grid; - index_t elementsE; + void* mE_grid; + index_t elementsE; }; // Invoker @@ -557,10 +555,13 @@ namespace ck if(arg.mE_grid != nullptr) { auto blockDim = dim3(1024); - auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::multiply<<>>( - arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.element_op.scale_, arg.elementsE); + hiptensor::multiply<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.scale_, + arg.elementsE); //hiptensor::pack<<>>( // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } @@ -602,7 +603,8 @@ namespace ck // Call the base, then fwd to each arg. this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); auto* arg = dynamic_cast(p_arg); - this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( arg->mScaleArgs[1].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( @@ -722,4 +724,3 @@ namespace ck } // namespace ck #endif // HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP - diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index d063ebf5..eb7d8919 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -582,9 +582,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - auto alphaValue - = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); - snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); + auto alphaValue = hiptensor::readVal( + alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%s", std::to_string(alphaValue).c_str()); } if(beta == nullptr) @@ -593,8 +593,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); + auto betaValue = hiptensor::readVal( + beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%s", std::to_string(betaValue).c_str()); } } else diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index 69e29b50..5a31a91f 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -327,3 +327,19 @@ bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType) { return !(computeType == hipType); } + +namespace std +{ + std::string to_string(const hiptensor::ScalarData& value) + { + if(value.mType == HIPTENSOR_COMPUTE_C32F || value.mType == HIPTENSOR_COMPUTE_C64F) + { + return string() + "[" + to_string(value.mComplex.x) + ", " + to_string(value.mComplex.y) + + "]"; + } + else + { + return to_string(value.mReal); + } + } +} diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 900b2069..db9ff6c7 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -115,6 +115,11 @@ bool operator==(hiptensorComputeType_t computeType, hipDataType hipType); bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType); bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType); +namespace std +{ + std::string to_string(const 
hiptensor::ScalarData& value); +} + #include "data_types_impl.hpp" #endif // HIPTENSOR_LIBRARY_DATA_TYPES_HPP diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index b0348b91..95c5d0f6 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -37,25 +37,12 @@ template -int bilinearContractionSample() +int bilinearContractionSample(void* alpha, void* beta) { - computeDataType alpha, beta; - if constexpr(std::is_same_v || std::is_same_v) - { - alpha = computeDataType(1.0, 1.0); - beta = computeDataType(1.0, 1.0); - } - else - { - alpha = (computeDataType)1.0f; - beta = (computeDataType)1.0f; - } - /********************** * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * *C_{m,n,u,v} @@ -280,17 +267,8 @@ int bilinearContractionSample() std::cout << "Launching contraction kernel..." << std::endl; - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); + CHECK_HIPTENSOR_ERROR(hiptensorContraction( + handle, &plan, alpha, A_d, B_d, beta, C_d, C_d, workspace, worksize, 0 /* stream */)); #if !NDEBUG bool printElements = false; diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp index f6714a2f..52915200 100644 --- a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_16BF; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp index 648675f6..5b3bb7cc 100644 --- a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp @@ -39,19 +39,20 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex CDataType; - typedef hipFloatComplex ComputeDataType; + typedef hipFloatComplex floatTypeCompute; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeC = HIP_C_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + floatTypeCompute alpha{1.0f, 1.0f}; + floatTypeCompute beta{1.0f, 1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp index 40708c77..8de0c534 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_16F; constexpr hiptensorComputeType_t typeCompute = 
HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp index 42f60ecb..6ce6d3c0 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp index d39a4fca..d4e28761 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp index ee046145..e493f1c3 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp index 673c4768..0faffc3e 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp index 412ebbc5..d5024eba 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return 
bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp index 45914e30..5db4598d 100644 --- a/samples/01_contraction/simple_scale_contraction.hpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -37,23 +37,12 @@ template -int scaleContractionSample() +int scaleContractionSample(void* alpha) { - computeDataType alpha; - if constexpr(std::is_same_v || std::is_same_v) - { - alpha = computeDataType(1.0, 1.0); - } - else - { - alpha = (computeDataType)1.0f; - } - /********************** * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} **********************/ @@ -272,7 +261,7 @@ int scaleContractionSample() CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan, - (void*)&alpha, + alpha, A_d, B_d, nullptr, diff --git a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp index 7b0f8b6c..5a991dbc 100644 --- a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp +++ b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp @@ -40,12 +40,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_16BF; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp index 0f6eaac3..a3eb5e6f 100644 --- a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp +++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp @@ -39,19 +39,19 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex DDataType; - typedef hipFloatComplex ComputeDataType; + typedef hipFloatComplex floatTypeCompute; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeD = HIP_C_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + floatTypeCompute alpha(1, 1); return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp index d69193f0..9283283b 100644 --- a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp @@ -46,12 +46,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_16F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp index c11b8ded..dac5e18b 100644 --- a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp @@ -47,12 +47,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + 
floatTypeCompute alpha = floatTypeCompute{1.0f}; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp index 377ee707..155f9585 100644 --- a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp @@ -47,12 +47,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp index e53cc468..2def291d 100644 --- a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp @@ -47,12 +47,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp index fdec48ab..7b2a9c95 100644 --- a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp @@ -46,12 +46,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp index 5eb94c15..201741e9 100644 --- a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp @@ -46,12 +46,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml index 0d59c05d..b9fe7876 100644 --- a/test/01_contraction/configs/complex_bilinear_test_params.yaml +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -16,11 +16,11 @@ Worksize Prefs: Alphas: - [0, 0] - [1, 1] - - [1, 1] + - [1.1, 1.2] Betas: - [2, 2] - [0, 0] - - [2, 2] + - [2.2, 2.3] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml index 89f9e736..355a5050 100644 --- a/test/01_contraction/configs/complex_scale_test_params.yaml +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -16,11 +16,11 @@ Worksize Prefs: Alphas: - [0, 0] - [1, 1] - - [1, 1] + - [1.1, 1.2] Betas: - [2, 2] - [0, 0] - - [2, 2] + - [2.2, 2.3] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 
6, 5 ] diff --git a/test/device/common.hpp b/test/device/common.hpp index 392c74c9..283a9035 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -84,7 +84,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) } else { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast(value); } } From 48abe4a97bb2d567a022eb191ef9d3f1467c3c73 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Wed, 27 Dec 2023 13:29:43 -0500 Subject: [PATCH 41/42] Cleanup --- .../src/contraction/contraction_pack_util.hpp | 22 ------------------- .../device_contraction_bilinear_complex.hpp | 19 ---------------- .../device_contraction_scale_complex.hpp | 19 ---------------- 3 files changed, 60 deletions(-) diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index 237e9d7f..5032fa8a 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -119,28 +119,6 @@ namespace hiptensor } } - /** - * \brief This function packs non-structured data (float / double) - * into structured data (hipFloatComplex / hipDoubleComplex). - */ - template - __global__ void pack(const InputType* in_real, InputType* in_img, OutputType *out, int length) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if(idx < length) - { - if constexpr(std::is_same_v) - { - out[idx] = make_hipFloatComplex((float)in_real[idx], (float)in_img[idx]); - } - else if constexpr(std::is_same_v) - { - out[idx] = make_hipDoubleComplex((double)in_real[idx], (double)in_img[idx]); - } - } - } - struct DeviceDeleter { void operator()(void* ptr) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 712ff3b0..307ecb1c 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -472,23 +472,6 @@ namespace ck mB_real, mE_imag, DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); - - // TODO UNCOMMENT WHEN DONE - // original - /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); - mBilinearArgs[0] = allocBilinearArgs( - mE_real, - mA_imag, - mB_imag, - mE_real, - BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); - mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_imag, - mB_real, - mE_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const @@ -573,8 +556,6 @@ namespace ck arg.element_op.alpha_, arg.element_op.beta_, arg.elementsE); - //hiptensor::pack<<>>( - // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index b875db3b..5b70cc11 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -465,23 +465,6 @@ namespace ck mB_real, mE_imag, DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); - - // TODO UNCOMMENT WHEN DONE - // original - /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, 
cde_element_op); - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); - mBilinearArgs[0] = allocBilinearArgs( - mE_real, - mA_imag, - mB_imag, - mE_real, - BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); - mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_imag, - mB_real, - mE_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const @@ -562,8 +545,6 @@ namespace ck ((ComplexE*)arg.mE_grid), arg.element_op.scale_, arg.elementsE); - //hiptensor::pack<<>>( - // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; From 348e28144e5f2502aed9c00aac9999ceb1e2cd29 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Fri, 29 Dec 2023 01:34:39 +0000 Subject: [PATCH 42/42] Set unit test difference threshold to epsilon of compute type - New single-kernel selection; to be improved. - Use the instance selected by brute force to compute tensors whose rightmost stride is 1 - Fix a bug that used the data type id as the compute data type id --- .../src/contraction/contraction_selection.cpp | 1404 ++++------------- .../permutation_cpu_reference_impl.hpp | 2 +- .../configs/bilinear_test_params.yaml | 4 +- .../configs/complex_bilinear_test_params.yaml | 4 +- .../configs/complex_scale_test_params.yaml | 4 +- .../configs/scale_test_params.yaml | 4 +- test/01_contraction/contraction_test.cpp | 28 +- .../permutation_cpu_impl_test.cpp | 6 +- test/02_permutation/permutation_resource.cpp | 2 +- test/02_permutation/permutation_test.cpp | 6 +- test/utils.hpp | 75 +- 11 files changed, 408 insertions(+), 1131 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 1f7b70a6..f96e8412 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -204,8 +204,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 7255639152084218514ull; + unique_id = 11124293857315312720ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -253,8 +252,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 7255639152084218514ull; + unique_id = 1953020431947874122ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -302,8 +300,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 8689089455041651212ull; + unique_id = 14895098881714635802ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -351,8 +348,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 8689089455041651212ull; + unique_id = 8517235228581081946ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -395,8 +391,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 1078559130597702989ull; + unique_id = 17313709378682913599ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -438,8 +433,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 6506383527825239632ull; + + unique_id = 14397647188602189900ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -481,8 +476,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 
14486135440731032454ull; + + unique_id = 8339198051871565944ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -529,8 +524,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 11931735240548010466ull; + + unique_id = 2724417728984064737ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -573,329 +568,50 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 43) + unique_id = 5943247903036531691ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d5 <= 61) - { - if(d3 <= 236) - { - if(d4 <= 519) - { - if(d1 <= 744) - { - if(d6 <= 8) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d3 <= 32) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - else - { - if(d6 <= 2) - { - if(d5 <= 15) - { - unique_id = 17618515137355245877ull; - } - else - { - if(d6 <= 1) - { - unique_id = 10830479759059230274ull; - } - else - { - if(d5 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - else - { - if(d5 <= 2) - { - if(d6 <= 8) - { - unique_id = 17618515137355245877ull; - } - else - { - unique_id = 10830479759059230274ull; - } - } - else - { - if(d1 <= 54) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d4 <= 218) - { - if(d5 <= 36) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d2 <= 50) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - } - } - } - } - } - else - { - if(d6 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d4 <= 557) - { - if(d2 <= 165) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d5 <= 68) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 24) - { - if(d3 <= 435) - { - if(d5 <= 7) - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 744) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d6 <= 60) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - } - } - else - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - if(d5 <= 13) - { - if(d5 <= 7) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d6 <= 58) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d1 <= 642) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } - else - { - if(d6 <= 54) - { - if(d5 <= 37) - { - if(d4 <= 556) - { - unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 222) - { - if(d4 <= 556) - { - 
unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d4 <= 44) - { - if(d3 <= 436) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d1 <= 220) - { - if(d2 <= 107) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d3 <= 72) - { - unique_id = 16481146763982821264ull; - } - else - { - if(d2 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 17972447156160297755ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -910,7 +626,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -929,6 +645,7 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { + int d1 = a_ms_ks_lengths[0]; int d2 = a_ms_ks_lengths[1]; int d3 = b_ns_ks_lengths[0]; @@ -938,322 +655,49 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 9) + unique_id = 3893144338697524749ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 4) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 16) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d2 <= 196) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 113) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d3 <= 219) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 8) - { - if(d6 <= 28) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 2) - { - if(d6 <= 58) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 1) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d2 <= 163) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 465) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } - else - { - if(d3 <= 121) - { - if(d4 <= 483) - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 152) - { - 
unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - else - { - if(d3 <= 37) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d4 <= 135) - { - if(d3 <= 413) - { - if(d6 <= 30) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d4 <= 36) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 120) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 32) - { - if(d5 <= 32) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } - else - { - if(d2 <= 115) - { - if(d6 <= 40) - { - if(d2 <= 51) - { - unique_id = 222393107113976106ull; - } - else - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d4 <= 486) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d1 <= 235) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 22) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d6 <= 32) - { - if(d5 <= 26) - { - if(d6 <= 23) - { - if(d1 <= 116) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - else - { - if(d5 <= 18) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - if(d5 <= 32) - { - if(d6 <= 16) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + unique_id = 15165261158317928321ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1268,7 +712,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1296,8 +740,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 11912251726020349830ull; + + unique_id = 
14511729289005214097ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1312,7 +756,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1339,8 +783,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - unique_id = 15375432626310194825ull; - // TODO select unique_id + + unique_id = 3636246152928348445ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1355,7 +799,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1384,238 +833,55 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 36) + unique_id = 5711776907278244209ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 35) - { - if(d1 <= 763) - { - if(d6 <= 3) - { - if(d5 <= 8) - { - unique_id = 9769367948782541618ull; - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - if(d6 <= 24) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 17) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - } - else - { - if(d5 <= 9) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d1 <= 759) - { - if(d6 <= 67) - { - if(d3 <= 535) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d4 <= 615) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - else - { - if(d5 <= 25) - { - if(d4 <= 428) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - else - { - if(d6 <= 64) - { - if(d3 <= 65) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 25) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 33) - { - if(d6 <= 8) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d2 <= 565) - { - if(d1 <= 646) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d6 <= 27) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 53) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - else - { - if(d6 <= 20) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 64) - { - if(d1 <= 648) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d6 <= 25) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } - } - else - { - if(d5 <= 45) - { - if(d6 <= 50) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - if(d6 <= 43) - { - if(d5 <= 52) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } 
- } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 355777364055884033ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1630,7 +896,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1649,6 +920,7 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { + int d1 = a_ms_ks_lengths[0]; int d2 = a_ms_ks_lengths[1]; int d3 = b_ns_ks_lengths[0]; @@ -1658,217 +930,55 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 39) + unique_id = 3085227716611397774ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d3 <= 937) - { - if(d6 <= 1) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d4 <= 754) - { - if(d5 <= 33) - { - if(d5 <= 1) - { - if(d6 <= 25) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d6 <= 6) - { - if(d5 <= 8) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d1 <= 404) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 50) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - } - else - { - unique_id = 1830537384143755749ull; - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 32) - { - if(d2 <= 832) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 8) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 24) - { - unique_id = 17689908062647780665ull; - } - else - { - if(d5 <= 64) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - else - { - if(d6 <= 46) - { - if(d5 <= 54) - { - if(d1 <= 460) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 49) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - else - { - if(d1 <= 182) - { - if(d5 <= 65) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - else - { - if(d5 <= 49) - { - if(d6 <= 64) - { - if(d1 <= 411) - { - if(d2 <= 396) - { - unique_id = 1830537384143755749ull; - } - else - { - 
unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 53) - { - if(d1 <= 222) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 2196983681630807584ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1901,7 +1011,7 @@ namespace hiptensor const uint64_t workspaceSize) { if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F - && computeType == HIP_R_32F) + && computeType == HIPTENSOR_COMPUTE_32F) { return ActorCriticSelection<_Float16, _Float16, @@ -1925,7 +1035,7 @@ namespace hiptensor workspaceSize); } else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F - && computeType == HIP_R_32F) + && computeType == HIPTENSOR_COMPUTE_32F) { return ActorCriticSelection<_Float16, _Float16, @@ -1949,7 +1059,7 @@ namespace hiptensor workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE - && typeE == HIP_R_16BF && computeType == HIP_R_32F) + && typeE == HIP_R_16BF && computeType == HIPTENSOR_COMPUTE_32F) { return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_32F && typeB == HIP_C_32F && typeD == HIP_C_32F && typeE == HIP_C_32F + && computeType == HIPTENSOR_COMPUTE_C32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == NONE_TYPE && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == HIP_C_64F && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + 
b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } return HIPTENSOR_STATUS_EXECUTION_FAILED; } } diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp index c1d4a3af..4820274f 100644 --- a/library/src/permutation/permutation_cpu_reference_impl.hpp +++ b/library/src/permutation/permutation_cpu_reference_impl.hpp @@ -92,7 +92,7 @@ namespace hiptensor auto bOffset = std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0); #endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR - B[bOffset] = static_cast(A[elementIndex] * alphaValue); + B[bOffset] = static_cast(A[elementIndex] * (DataType)alphaValue); } return HIPTENSOR_STATUS_SUCCESS; diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 1e7999fc..9306445a 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml index b9fe7876..dfbb814e 100644 --- a/test/01_contraction/configs/complex_bilinear_test_params.yaml +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -6,7 +6,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -24,7 +24,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml index 355a5050..4bad2a9b 100644 --- a/test/01_contraction/configs/complex_scale_test_params.yaml +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -6,7 +6,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -24,7 +24,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index bc8289f5..4c52eeda 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... 
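The test changes that follow thread the compute type into the result comparison so that the pass/fail threshold tracks the precision the kernel actually computed in. A minimal standalone sketch of that rule (the helper name and sample numbers below are illustrative only; the in-tree version is the getEpsilon/compareEqual pair in the test/utils.hpp hunk at the end of this patch, whose default tolerance multiplier is 100.0):

#include <cstdio>
#include <limits>

// Illustrative only: the allowed relative error scales with the machine
// epsilon of the *compute* type, not of the output type.
static bool withinComputeTypeTolerance(double maxRelativeError,
                                       double computeTypeEpsilon,
                                       double tolerance = 100.0)
{
    return maxRelativeError <= tolerance * computeTypeEpsilon;
}

int main()
{
    double epsF32   = std::numeric_limits<float>::epsilon(); // ~1.19e-07
    double epsBF16  = 0.0078125; // bf16 keeps 7 mantissa bits: eps = 2^-7
    double observed = 1.0e-4;    // example max relative error from a kernel run

    // f32 data computed in f32: 1e-4 exceeds 100 * eps(f32), so it fails ...
    std::printf("f32 compute:  %s\n",
                withinComputeTypeTolerance(observed, epsF32) ? "pass" : "fail");
    // ... but the same error is acceptable when the compute type was bf16.
    std::printf("bf16 compute: %s\n",
                withinComputeTypeTolerance(observed, epsBF16) ? "pass" : "fail");
    return 0;
}

This is why the yaml configs above can enable HIPTENSOR_ALGO_ACTOR_CRITIC and non-trivial alpha/beta values: a bf16 or f16 compute path no longer trips a threshold derived from the f32/f64 output type.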
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index a75cf7bf..664da2ec 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -628,8 +628,8 @@ namespace hiptensor DDataType, workspace)); - size_t elementsCD = std::accumulate(c_ms_ns.mLengths.begin(), - c_ms_ns.mLengths.end(), + size_t elementsCD = std::accumulate(d_ms_ns.mLengths.begin(), + d_ms_ns.mLengths.end(), size_t{1}, std::multiplies()); @@ -639,8 +639,11 @@ namespace hiptensor if(DDataType == HIP_R_16F) { - std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( - (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + (_Float16*)reference.get(), + elementsCD, + computeType); } else if(DDataType == HIP_R_16BF) { @@ -648,17 +651,24 @@ namespace hiptensor = compareEqualLaunchKernel( (hip_bfloat16*)resource->deviceD().get(), (hip_bfloat16*)reference.get(), - elementsCD); + elementsCD, + computeType); } else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F) { - std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( - (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel((float*)resource->deviceD().get(), + (float*)reference.get(), + elementsCD, + computeType); } else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F) { - std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( - (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD); + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel((double*)resource->deviceD().get(), + (double*)reference.get(), + elementsCD, + computeType); } EXPECT_TRUE(mValidationResult) << "Max relative error: " << mMaxRelativeError; diff --git a/test/02_permutation/permutation_cpu_impl_test.cpp b/test/02_permutation/permutation_cpu_impl_test.cpp index 014dbc61..5a885f0b 100644 --- a/test/02_permutation/permutation_cpu_impl_test.cpp +++ b/test/02_permutation/permutation_cpu_impl_test.cpp @@ -125,7 +125,11 @@ auto permuteWithCpu(hipDataType typeA, hipDataType typeB, hipDataType typeComput &descB, modeB.data(), typeCompute); - return compareEqual(referenceArray.data(), bArray.data(), bArray.size(), 10); + return compareEqual(referenceArray.data(), + bArray.data(), + bArray.size(), + hiptensor::convertToComputeType(typeCompute), + 10); } TEST(PermutationCpuImplTest, CompareF32ResultWithReference) diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp index 1f448ff8..6acd7577 100644 --- a/test/02_permutation/permutation_resource.cpp +++ b/test/02_permutation/permutation_resource.cpp @@ -72,7 +72,7 @@ namespace hiptensor mCurrentAllocByte = requiredMemorySize; needFillData = true; } - else if(mCurrentDataType != dataType) + if(mCurrentDataType != dataType || mCurrentMatrixElement < requiredElementCount) { needFillData = true; } diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp index cfadf5c0..078c78a4 100644 --- a/test/02_permutation/permutation_test.cpp +++ b/test/02_permutation/permutation_test.cpp @@ -257,7 +257,8 @@ namespace hiptensor std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel((float*)resource->deviceB().get(), 
(float*)resource->deviceReference().get(), - resource->getCurrentMatrixElement()); + resource->getCurrentMatrixElement(), + convertToComputeType(computeDataType)); } else if(abDataType == HIP_R_16F) { @@ -273,7 +274,8 @@ namespace hiptensor std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( (_Float16*)resource->deviceB().get(), (_Float16*)resource->deviceReference().get(), - resource->getCurrentMatrixElement()); + resource->getCurrentMatrixElement(), + convertToComputeType(computeDataType)); } } diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..fc999738 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -57,6 +57,59 @@ CHECK_HIP_ERROR(hipHostFree(ptr)); \ } +inline double getEpsilon(hiptensorComputeType_t id) +{ + auto toDouble = [](auto const& val) { return static_cast(static_cast(val)); }; + + if(id == HIPTENSOR_COMPUTE_16F) + { + return toDouble(std::numeric_limits<_Float16>::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return 0; + } +} + inline bool isF32Supported() { hipDevice_t mHandle; @@ -137,10 +190,11 @@ __host__ static inline void } template -std::pair compareEqual(DDataType const* deviceD, - DDataType const* hostD, - std::size_t elementsD, - double tolerance = 100.0) +std::pair compareEqual(DDataType const* deviceD, + DDataType const* hostD, + std::size_t elementsD, + hiptensorComputeType_t computeType, + double tolerance = 100.0) { bool retval = true; double max_relative_error = 0.0; @@ -191,7 +245,7 @@ std::pair compareEqual(DDataType const* deviceD, } } - auto eps = toDouble(std::numeric_limits::epsilon()); + auto eps = getEpsilon(computeType); if(isInf) { retval = false; @@ -211,10 +265,11 @@ std::pair compareEqual(DDataType const* deviceD, } template -std::pair compareEqualLaunchKernel(DDataType* deviceD, - DDataType* hostD, - std::size_t elementsD, - double tolerance = 100.0) +std::pair compareEqualLaunchKernel(DDataType* deviceD, + DDataType* hostD, + std::size_t elementsD, + hiptensorComputeType_t computeType, + double tolerance = 100.0) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,7 +331,7 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; - auto eps = toDouble(std::numeric_limits::epsilon()); + auto eps = getEpsilon(computeType); if(isNaN) { retval = false;
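With compareEqual and compareEqualLaunchKernel now deriving eps from getEpsilon(computeType), two properties carry the scheme: epsilon shrinks as compute precision grows, and integer compute types report an epsilon of 0, which degrades the comparison to exact equality. A hypothetical sanity test of those properties (illustrative, not part of the patch; it assumes getEpsilon is reachable via test/utils.hpp and that the complex compute types reuse their real scalar's epsilon, as the hunk above suggests):

#include <gtest/gtest.h>
#include "utils.hpp" // assumed include for getEpsilon from the hunk above

TEST(UtilsTest, EpsilonTracksComputePrecision)
{
    // Higher-precision compute types must yield tighter thresholds.
    EXPECT_LT(getEpsilon(HIPTENSOR_COMPUTE_64F), getEpsilon(HIPTENSOR_COMPUTE_32F));
    EXPECT_LT(getEpsilon(HIPTENSOR_COMPUTE_32F), getEpsilon(HIPTENSOR_COMPUTE_16F));
    EXPECT_LT(getEpsilon(HIPTENSOR_COMPUTE_16F), getEpsilon(HIPTENSOR_COMPUTE_16BF));
    // Assumption: complex compute types reuse their real scalar's epsilon.
    EXPECT_EQ(getEpsilon(HIPTENSOR_COMPUTE_C32F), getEpsilon(HIPTENSOR_COMPUTE_32F));
    // Integer compute types return 0, i.e. tolerance * eps == 0: exact match.
    EXPECT_EQ(getEpsilon(HIPTENSOR_COMPUTE_32I), 0.0);
}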