diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 95e8b2ba..0e0a252e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,7 +6,7 @@ version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values - directory: "/docs/.sphinx" # Location of package manifests + directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" diff --git a/.gitignore b/.gitignore index ad44a303..9945a9dc 100644 --- a/.gitignore +++ b/.gitignore @@ -50,13 +50,3 @@ build* \#*\# *~ *.log - -# documentation artifacts -build/ -_build/ -_images/ -_static/ -_templates/ -_toc.yml -docBin/ -_doxygen/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e2bf130c..9e6678ab 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,11 +10,9 @@ formats: [htmlzip, pdf, epub] python: install: - - requirements: docs/.sphinx/requirements.txt + - requirements: docs/sphinx/requirements.txt build: - os: ubuntu-20.04 + os: ubuntu-22.04 tools: python: "3.8" - apt_packages: - - "doxygen" diff --git a/README.md b/README.md index f5e55943..5af7912d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Run the steps below to build documentation locally. ```shell cd docs -pip3 install -r .sphinx/requirements.txt +pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` @@ -98,21 +98,24 @@ After configuration, build with `cmake --build -- -j` ### Logger tests Tests API implementation of logger verbosity and functionality. -o /bin/logger_test + +* `/bin/logger_test` ## Running Contraction Tests ### Bilinear contraction tests Tests the API implementation of bilinear contraction algorithm with validation. -o /bin/bilinear_contraction_f32_test -o /bin/bilinear_contraction_f64_test + +* `/bin/bilinear_contraction_f32_test` +* `/bin/bilinear_contraction_f64_test` ### Scale contraction tests Tests the API implementation of scale contraction algorithm with validation. -o /bin/scale_contraction_f32_test -o /bin/scale_contraction_f64_test + +* `/bin/scale_contraction_f32_test` +* `/bin/scale_contraction_f64_test` ### Samples @@ -121,12 +124,14 @@ These are stand-alone use-cases of the hipTensor contraction operations. ## F32 Bilinear contraction Demonstrates the API implementation of bilinear contraction operation without validation. -o /bin/simple_contraction_bilinear_f32 + +* `/bin/simple_contraction_bilinear_f32` ## F32 Scale contraction Demonstrates the API implementation of scale contraction operation without validation. -o /bin/simple_contraction_scale_f32 + +* `/bin/simple_contraction_scale_f32` ### Build Samples as external client diff --git a/docs/.gitignore b/docs/.gitignore index a44ccbe0..594c0c8c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,7 +1,5 @@ -.doxygen/docBin -.sphinx/_toc.yml -_build -_doxygen -_images -_static -_templates \ No newline at end of file +doxygen/html +doxygen/xml +sphinx/_toc.yml +_build/ +_doxygen/ diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in deleted file mode 100644 index 313c5e94..00000000 --- a/docs/.sphinx/requirements.in +++ /dev/null @@ -1 +0,0 @@ -rocm-docs-core>=0.24.0 diff --git a/docs/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst index 551e2ee0..77e86343 100644 --- a/docs/API_Reference_Guide.rst +++ b/docs/API_Reference_Guide.rst @@ -3,15 +3,16 @@ Introduction ************ -hiptensor Data Types +hipTensor Data Types ==================== +.. 
+
 hiptensorStatus_t
 -----------------

 .. doxygenenum:: hiptensorStatus_t

-
 hiptensorComputeType_t
 ----------------------

@@ -160,3 +161,5 @@ hiptensorLoggerForceDisable
 ---------------------------

 .. doxygenfunction:: hiptensorLoggerForceDisable
+
+..
diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst
index aeb87211..212248be 100644
--- a/docs/Contributors_Guide.rst
+++ b/docs/Contributors_Guide.rst
@@ -15,8 +15,7 @@ License Agreement

 Pull-request guidelines
 =======================
-
-Our code contriubtion guidelines closely follows the model of `GitHub
+Our code contribution guidelines closely follow the model of `GitHub
 pull-requests `__. The hipTensor repository follows a workflow which dictates a /master
 branch where releases are cut, and a /develop branch which serves as an
 integration branch for new code.

 Pull requests should:

@@ -30,8 +29,8 @@ The hipTensor repository follows a workflow which dictates a /master branch wher
 - code must also have benchmark tests, and performance must approach
   the compute bound limit or memory bound limit.

-StyleGuide
-==========
+Style Guide
+===========

 This project follows the `CPP Core guidelines `__,
@@ -44,7 +43,7 @@ Interface
 ---------

 - Library code should use C++17
-- Avoid CamelCase
+- Avoid camel case

   - This rule applies specifically to publicly visible APIs, but is also
     encouraged (not mandated) for internal code
@@ -52,8 +51,8 @@ Philosophy
 ----------

 -  `P.2 `__:
-   Write in ISO Standard C++14 (especially to support windows, linux and
-   macos plaforms )
+   Write in ISO Standard C++14 (especially to support Windows, Linux and
+   macOS platforms)
 -  `P.5 `__:
    Prefer compile-time checking to run-time checking
@@ -105,19 +104,19 @@
 will result in different results.

 To format a file, use:

-::
+.. code-block::

-    /opt/rocm/llvm/bin/clang-format -style=file -i
+   /opt/rocm/llvm/bin/clang-format -style=file -i

 To format all files, run the following script in hipTensor directory:

-::
+.. code-block::

-    #!/bin/bash
-    git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i
+   #!/bin/bash
+   git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i

 Also, githooks can be installed to format the code per-commit:

-::
+.. code-block::

-    ./.githooks/install
+   ./.githooks/install
diff --git a/docs/Linux_Install_Guide.rst b/docs/Linux_Install_Guide.rst
index 47cdc339..ace565c1 100644
--- a/docs/Linux_Install_Guide.rst
+++ b/docs/Linux_Install_Guide.rst
@@ -104,9 +104,9 @@ Minimum ROCm version support is 5.7.

 By default, the project is configured as Release mode.

-To build only library, run the following comomand :
+To build only the library, run the following command:

-    CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF
+    :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF`

 Here are some other example project configurations:

 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+
 | Configuration                     | Command                                                                                                            |
 +===================================+====================================================================================================================+
-| Basic                             | CC=hipcc CXX=hipcc cmake -B .                                                                                      |
+| Basic                             | :code:`CC=hipcc CXX=hipcc cmake -B .`                                                                              |
 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Targeting gfx908                  | CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-                                                       |
+| Targeting gfx908                  | :code:`CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-`                                               |
 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Debug build                       | CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug                                                             |
+| Debug build                       | :code:`CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug`                                                     |
 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+

 After configuration, build with

-    cmake --build -- -j
+    :code:`cmake --build -- -j`

 Build library + samples
 ^^^^^^^^^^^^^^^^^^^^^^^

-To build library and samples, run the following comomand :
+To build the library and samples, run the following command:

-    CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON
+    :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON`

 After configuration, build with

-    cmake --build -- -j
+    :code:`cmake --build -- -j`

-The samples folder in contains executables in the table below.
+The samples folder in :code:`` contains executables in the table below.

 =================================== ===================================================================================
 executable name                     description
 =================================== ===================================================================================

 Build library + tests
 ^^^^^^^^^^^^^^^^^^^^^^

 To build library and tests, run the following command :

-    CC=hipcc CXX=hipcc cmake -B .
+    :code:`CC=hipcc CXX=hipcc cmake -B .`

 After configuration, build with

-    cmake --build -- -j
+    :code:`cmake --build -- -j`

-The tests in contains executables in the table below.
+The tests in `` contains executables in the table below.

 ====================================== ===================================================================================
 executable name                        description
 ====================================== ===================================================================================
@@ -177,6 +177,7 @@ Build library + Documentation

 Run the steps below to build documentation locally.

+.. code-block::

    cd docs

    sudo apt-get update
@@ -191,4 +192,4 @@ Run the steps below to build documentation locally.

    pdflatex hiptensor.tex

-Generates hiptensor.pdf here
+Generates :code:`hiptensor.pdf` here.
diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst
index 460bb970..047c1f5a 100644
--- a/docs/Programmers_Guide.rst
+++ b/docs/Programmers_Guide.rst
@@ -1,4 +1,3 @@
-
 ===================
 Programmer's Guide
 ===================
@@ -17,84 +16,84 @@ The hipTensor code is split into four major parts:

 The `library` directory
 ^^^^^^^^^^^^^^^^^^^^^^^

-library/include/hiptensor/
-'''''''''''''''''''''''''''
+`library/include/hiptensor/`
+''''''''''''''''''''''''''''

 Contains C++ include files for the hipTensor API. These files also contain Doxygen comments that document the API.

-library/include/hiptensor/internal
-''''''''''''''''''''''''''''''''''
+`library/include/hiptensor/internal`
+''''''''''''''''''''''''''''''''''''

 Internal include files for:

 - Utility Code
 - Generate Tensor Utility

-library/src/
-''''''''''''
+`library/src/`
+''''''''''''''

 Contains logger, device and performance functions.
-library/src/contraction/ -'''''''''''''''''''''''' +`library/src/contraction/` +'''''''''''''''''''''''''' Contains hipTensor core composable kernel header functions and contraction initialization functions. -library/src/contraction/device -'''''''''''''''''''''''''''''' +`library/src/contraction/device` +'''''''''''''''''''''''''''''''' Contains hipTensor Bilinear and Scale instance functions The `samples` directory ^^^^^^^^^^^^^^^^^^^^^^^ -01_contraction/simple_bilinear_contraction_f32.cpp -'''''''''''''''''''''''''''''''''''''''''''''''''' +`01_contraction/simple_bilinear_contraction_f32.cpp` +'''''''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling bilinear contraction for fp32 input, output and compute types +sample code for calling bilinear contraction for :code:`fp32` input, output and compute types -01_contraction/simple_scale_contraction_f32.cpp -''''''''''''''''''''''''''''''''''''''''''''''' +`01_contraction/simple_scale_contraction_f32.cpp` +''''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling scale contraction for fp32 input, output and compute types +sample code for calling scale contraction for :code:`fp32` input, output and compute types The `test` directory ^^^^^^^^^^^^^^^^^^^^^^^ -00_unit/logger -'''''''''''''' +`00_unit/logger` +'''''''''''''''' Test code for testing logger API Functions of hipTensor -01_contraction/bilinear_contraction_f32 -''''''''''''''''''''''''''''''''''''''' +`01_contraction/bilinear_contraction_f32` +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F32 types. -01_contraction/bilinear_contraction_f64 -''''''''''''''''''''''''''''''''''''''' +`01_contraction/bilinear_contraction_f64` +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F64 types. -01_contraction/scale_contraction_f32 -'''''''''''''''''''''''''''''''''''' +`01_contraction/scale_contraction_f32` +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F32 types. -01_contraction/scale_contraction_f64 -'''''''''''''''''''''''''''''''''''' +`01_contraction/scale_contraction_f64` +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F64 types. Infrastructure ^^^^^^^^^^^^^^ -- CMake is used to build and package hipTensor. There are CMakeLists.txt files throughout the code. -- Doxygen/Breathe/Sphinx/ReadTheDocs are used to produce documentation. Content for the documentation is from: +- CMake is used to build and package hipTensor. There are :code:`CMakeLists.txt` files throughout the code. +- `Doxygen/Breathe/Sphinx/ReadtheDocs` are used to produce documentation. Content for the documentation is from: - - Doxygen comments in include files in the directory library/include - - files in the directory docs/ + - Doxygen comments in include files in the directory :code:`library/include` + - files in the directory :code:`docs/` - Jenkins is used to automate Continuous Integration testing. -- clang-format is used to format C++ code. +- :code:`clang-format` is used to format C++ code. 
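Taken together, the configure, build, and test commands documented in the README and Linux_Install_Guide hunks above form one end-to-end flow. Below is a minimal sketch of that flow, assuming an in-source build directory (the guide's `cmake -B .`), the default configuration (tests and samples enabled), and the `./bin/` output location the README lists for test executables:

```shell
# Configure with hipcc as the compiler; tests and samples build by default
CC=hipcc CXX=hipcc cmake -B .

# Build in parallel; "." as the build directory follows the cmake -B . above
cmake --build . -- -j

# Run one of the test executables listed in the README
./bin/logger_test
```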
diff --git a/docs/conf.py b/docs/conf.py index 4f00fb9e..e7e64d90 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,11 +29,31 @@ # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re + from rocm_docs import ROCmDocs -docs_core = ROCmDocs("hipTensor Documentation") -docs_core.run_doxygen() +with open('../CMakeLists.txt', encoding='utf-8') as f: + match = re.search(r'.*\bset \( VERSION_STRING\s+\"?([0-9.]+)[^0-9.]+', f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] +left_nav_title = f"hipTensor {version_number} Documentation" + +# for PDF output on Read the Docs +project = "hipTensor Documentation" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number + +external_toc_path = "./sphinx/_toc.yml" + +docs_core = ROCmDocs(left_nav_title) +docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() +external_projects_current_project = "hiptensor" + for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) diff --git a/docs/.doxygen/Doxyfile b/docs/doxygen/Doxyfile similarity index 99% rename from docs/.doxygen/Doxyfile rename to docs/doxygen/Doxyfile index 59a973b7..6f96968a 100644 --- a/docs/.doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = docBin +OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -786,7 +786,8 @@ WARN_AS_ERROR = YES INPUT = ../../library/include/hiptensor \ ../../library/include/hiptensor/internal \ - ../../library/src + ../../library/src \ + ../../README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -965,7 +966,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -USE_MDFILE_AS_MAINPAGE = ../README.md +USE_MDFILE_AS_MAINPAGE = ../../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing @@ -2074,7 +2075,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = __device__ +PREDEFINED = __device__ \ + DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/docs/index.rst b/docs/index.rst index 566a00e5..ba5e1cb7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,6 @@ ============================================================================ -hiptensor: A High-Performance HIP Library For Tensor Primitives +hipTensor: A High-Performance HIP Library For Tensor Primitives ============================================================================ -hiptensor is AMD's C++ library for accelerating tensor primitives based on the +hipTensor is AMD's C++ library for accelerating tensor primitives based on the composable kernel library, through general purpose kernel languages, like HIP C++. diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 00000000..141b5d3c --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,4 @@ +License +======= + +.. include:: ../LICENSE diff --git a/docs/.sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in similarity index 84% rename from docs/.sphinx/_toc.yml.in rename to docs/sphinx/_toc.yml.in index 37b5a62b..6da76c27 100644 --- a/docs/.sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -8,3 +8,6 @@ subtrees: - file: API_Reference_Guide - file: Programmers_Guide - file: Contributors_Guide + - caption: About + entries: + - file: license diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in new file mode 100644 index 00000000..b80af261 --- /dev/null +++ b/docs/sphinx/requirements.in @@ -0,0 +1 @@ +rocm-docs-core==0.30.3 diff --git a/docs/.sphinx/requirements.txt b/docs/sphinx/requirements.txt similarity index 96% rename from docs/.sphinx/requirements.txt rename to docs/sphinx/requirements.txt index 94103e1a..81f0b559 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==41.0.4 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub @@ -40,7 +40,7 @@ fastjsonschema==2.16.3 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests @@ -84,9 +84,7 @@ pygments==2.15.0 # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 - # via - # pygithub - # pyjwt + # via pygithub pynacl==1.5.0 # via pygithub pytz==2023.3.post1 @@ -100,7 +98,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.28.0 +rocm-docs-core==0.30.3 # via -r requirements.in smmap==5.0.0 # via gitdb @@ -143,7 +141,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme -urllib3==1.26.15 +urllib3==1.26.18 # via requests wrapt==1.15.0 # via deprecated diff --git a/library/include/hiptensor/hiptensor_types.hpp b/library/include/hiptensor/hiptensor_types.hpp index 85a5d90e..ca666a5b 100644 --- a/library/include/hiptensor/hiptensor_types.hpp +++ b/library/include/hiptensor/hiptensor_types.hpp @@ -90,6 +90,8 @@ typedef enum HIPTENSOR_COMPUTE_8I = (1U << 8U), HIPTENSOR_COMPUTE_32U = (1U << 7U), HIPTENSOR_COMPUTE_32I = (1U << 9U), + HIPTENSOR_COMPUTE_C32F = (1U << 11U), + HIPTENSOR_COMPUTE_C64F = (1U << 12U), HIPTENSOR_COMPUTE_NONE = 0 } hiptensorComputeType_t; diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp similarity index 100% rename from library/src/include/config.hpp rename to library/include/hiptensor/internal/config.hpp diff --git a/library/include/hiptensor/internal/hiptensor-version.hpp.in b/library/include/hiptensor/internal/hiptensor-version.hpp.in index e1942a2b..89247375 100644 --- 
a/library/include/hiptensor/internal/hiptensor-version.hpp.in
+++ b/library/include/hiptensor/internal/hiptensor-version.hpp.in
@@ -38,6 +38,15 @@
 #define HIPTENSOR_PATCH_VERSION @hiptensor_VERSION_PATCH@
 // clang-format on

+/**
+ * \brief Returns the version number of hipTensor
+ *
+ * \details Returns the version with the three least significant digits for the patch version,
+ * the next three digits for the minor version, and the most significant digits for the major version.
+ *
+ * \returns The version number.
+ */
+
 inline size_t hiptensorGetVersion()
 {
     return HIPTENSOR_MAJOR_VERSION * 1e6 + HIPTENSOR_MINOR_VERSION * 1e3 + HIPTENSOR_PATCH_VERSION;
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index f2df2dd2..746f1bbf 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -29,8 +29,10 @@
 #include
 #include
 #include
+#include

 #include "../hiptensor_types.hpp"
+#include "types_ext.hpp"

 #ifndef CHECK_HIP_ERROR
 #define CHECK_HIP_ERROR(expression)     \
@@ -60,6 +62,20 @@
     }
 #endif

+inline std::ostream& operator<<(std::ostream& os, const hipFloatComplex& fc)
+{
+    std::string separator = (hipCimagf(fc) >= 0) ? " + " : "";
+
+    return os << hipCrealf(fc) << separator << hipCimagf(fc) << "i";
+}
+
+inline std::ostream& operator<<(std::ostream& os, const hipDoubleComplex& dc)
+{
+    std::string separator = (hipCimag(dc) >= 0) ? " + " : "";
+
+    return os << hipCreal(dc) << separator << hipCimag(dc) << "i";
+}
+
 template
 void hiptensorPrintArrayElements(std::ostream& stream, T* vec, size_t size)
 {
diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp
similarity index 97%
rename from library/src/include/native_types.hpp
rename to library/include/hiptensor/internal/native_types.hpp
index 6c9dbee8..69ce706f 100644
--- a/library/src/include/native_types.hpp
+++ b/library/include/hiptensor/internal/native_types.hpp
@@ -33,8 +33,6 @@
 #include
 #include

-#include "xfloat32.hpp"
-
 namespace hiptensor
 {

@@ -84,9 +82,6 @@ namespace hiptensor
 #if !HIPTENSOR_NO_HALF
     using hfloat16_t = __half;
 #endif // !HIPTENSOR_NO_HALF
-
-    using xfloat32_t = hiptensor_xfloat32;
-
     // clang-format off
diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp
similarity index 100%
rename from library/src/include/native_types_impl.hpp
rename to library/include/hiptensor/internal/native_types_impl.hpp
diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp
similarity index 80%
rename from library/src/include/type_traits.hpp
rename to library/include/hiptensor/internal/type_traits.hpp
index 3867839d..81bafacd 100644
--- a/library/src/include/type_traits.hpp
+++ b/library/include/hiptensor/internal/type_traits.hpp
@@ -26,9 +26,11 @@
 #ifndef HIPTENSOR_TYPE_TRAITS_HPP
 #define HIPTENSOR_TYPE_TRAITS_HPP

-#include "native_types.hpp"
 #include

+#include "config.hpp"
+#include "native_types.hpp"
+
 namespace hiptensor
 {
     namespace detail
@@ -69,9 +71,8 @@ namespace hiptensor
         {
             union
             {
-                uint32_t   i32;
-                float32_t  f32;
-                xfloat32_t xf32;
+                uint32_t  i32;
+                float32_t f32;
             };
             constexpr Fp32Bits(uint32_t initVal)
                 : i32(initVal)
             {
             }
             constexpr Fp32Bits(float32_t initVal)
                 : f32(initVal)
             {
             }
-            constexpr Fp32Bits(xfloat32_t initVal)
-                : xf32(initVal)
-            {
-            }
         };
     } // namespace detail

@@ -96,6 +93,7 @@ namespace std
     /////////// std::numeric_limits //////////////
/////////////////////////////////////////////////////////// +#ifndef DOXYGEN_SHOULD_SKIP_THIS template <> HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t numeric_limits::epsilon() noexcept @@ -273,68 +271,7 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } - - /////////////////////////////////////////////////////////// - /////////// std::numeric_limits ////////////// - /////////////////////////////////////////////////////////// - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::epsilon() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::infinity() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::lowest() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::max() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::min() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::quiet_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::signaling_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); - return eps.xf32; - } - // @endcond - +#endif // DOXYGEN_SHOULD_SKIP_THIS } // namespace std namespace hiptensor @@ -378,13 +315,6 @@ namespace hiptensor // b16 mantissa is 7 bits return ((int32_t)1 << 8); } - - template ::value, int> = 0> - constexpr auto maxExactInteger() -> int32_t - { - // xf32 mantissa is 7 bits - return ((int32_t)1 << 8); - } } // namespace hiptensor #endif // HIPTENSOR_TYPE_TRAITS_HPP diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp similarity index 100% rename from library/src/include/types.hpp rename to library/include/hiptensor/internal/types.hpp diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp similarity index 100% rename from library/src/include/types_ext.hpp rename to library/include/hiptensor/internal/types_ext.hpp diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp index 13dcdffd..ac1d9711 100644 --- a/library/src/contraction/contraction_cpu_reference.cpp +++ b/library/src/contraction/contraction_cpu_reference.cpp @@ -28,31 +28,33 @@ #include "contraction_cpu_reference_impl.hpp" #include "contraction_cpu_reference_instances.hpp" -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ns_ks_lengths, - std::vector const& b_ns_ks_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - 
hipDataType typeC, - hipDataType typeD, - void* workspace) +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace) { - auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto computeType = plan->mContractionDesc.mComputeType; auto candidates - = (C == nullptr) - ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD) - : instances->allSolutions().query(typeA, typeB, typeC, typeD); + = (C == nullptr) ? instances->allSolutions().query( + typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType) + : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType); auto toCKVec = [](auto& inputVec) { return std::vector(inputVec.begin(), inputVec.end()); }; diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp index aadb062e..471026dc 100644 --- a/library/src/contraction/contraction_cpu_reference.hpp +++ b/library/src/contraction/contraction_cpu_reference.hpp @@ -32,24 +32,25 @@ #include -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ks_ns_lengths, - std::vector const& b_ks_ns_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - void* workspace); +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ks_ns_lengths, + std::vector const& b_ks_ns_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace); #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 673f6dff..2f031bb0 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -45,19 +45,25 @@ namespace hiptensor { // hardcoded for NumDimM == NumDimN == NumDimK == 2 + // + // ck::bhalf_t is ushort, cannot perform bhalf_t * bhalf_t + // CK does not use ck::bhalf_t as AccDataType. 
But we still + // add this guard here template < ck::index_t NumDimM, ck::index_t NumDimN, ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, - typename AccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, - ck::enable_if_t, bool> = false> struct ReferenceContraction_M2_N2_K2 @@ -70,7 +76,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation> + CDEElementwiseOperation, + ComputeDataType> { using BaseArgument = ck::tensor_operation::device::BaseArgument; using BaseInvoker = ck::tensor_operation::device::BaseInvoker; @@ -149,57 +156,163 @@ namespace hiptensor indices.begin(), indices.end(), strides.begin(), std::size_t{0}); }; - auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - auto accum = static_cast(0); - - auto K0 = arg.mA_ms_ks_lengths[2]; - auto K1 = arg.mA_ms_ks_lengths[3]; - - for(size_t k0 = 0; k0 < K0; k0++) - { - for(size_t k1 = 0; k1 < K1; k1++) + if constexpr((std::is_same_v && + std::is_same_v && + std::is_same_v) || + (std::is_same_v && + std::is_same_v && + std::is_same_v)) + { + auto f_ms_ns_complex = [&](auto m0, auto m1, auto n0, auto n1) { + HIP_vector_type accum{0}; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) + { + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + ADataType valA = ((ADataType*)arg.mA)[indexA]; + BDataType valB = ((BDataType*)arg.mB)[indexB]; + + // Mult / accum + if constexpr(std::is_same_v) + { + accum = hipCaddf(accum, hipCmulf(valA, valB)); + } + else if constexpr(std::is_same_v) + { + accum = hipCadd(accum, hipCmul(valA, valB)); + } + } + } + + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.scale_ * (EDataType)accum; + } + else if constexpr(std::is_same_v) + { + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCmulf(hipComplexDoubleToFloat(arg.mOpCDE.scale_), (EDataType)accum); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCmul(arg.mOpCDE.scale_, (EDataType)accum); + } + } + else if constexpr(std::is_same_v) + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.alpha_ * (EDataType)accum + + arg.mOpCDE.beta_ * ((EDataType*)(arg.mD[0]))[indexD]; + } + else if constexpr(std::is_same_v) + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCaddf( + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.alpha_), + (EDataType)accum), + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.beta_), + ((EDataType*)(arg.mD[0]))[indexD])); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCadd(hipCmul(arg.mOpCDE.alpha_, (EDataType)accum), + hipCmul(arg.mOpCDE.beta_, ((EDataType*)(arg.mD[0]))[indexD])); + } + } + }; + + make_ParallelTensorFunctor(f_ms_ns_complex, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } + else + { + auto f_ms_ns = [&](auto m0, auto 
m1, auto n0, auto n1) { + AccDataType accum = 0; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) { - auto indexA - = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); - auto indexB - = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - - ADataType valA; - BDataType valB; + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + AccDataType valA; + AccDataType valB; + + // Element-wise ops + arg.mOpA( + valA, + ck::type_convert(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, + ck::type_convert(((BDataType*)arg.mB)[indexB])); + + // Mult / accum + accum += valA * valB; + } + } - // Element-wise ops - arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]); - arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - // Mult / accum - accum - += static_cast(valA) * static_cast(valB); + if constexpr(std::is_same_v) + { + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } - } - - auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - - if constexpr(std::is_same_v) - { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum); - } - else // bilinear - { - // NumDTensor will be 1 due to SFINAE of this class - auto indexD - = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE( - ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]); - } - }; + else // bilinear + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); + } + }; - make_ParallelTensorFunctor(f_ms_ns, - arg.mE_ms_ns_lengths[0], - arg.mE_ms_ns_lengths[1], - arg.mE_ms_ns_lengths[2], - arg.mE_ms_ns_lengths[3])( - std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_ms_ns, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } return 0; } @@ -319,23 +432,25 @@ namespace hiptensor ck::index_t NumDimsK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, - typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>> : public MetaTraits< ck::tensor_operation::device::DeviceContractionMultipleD> + CDEElementwiseOperation, + ComputeDataType>> { }; @@ -355,11 +471,13 @@ namespace hiptensor ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> auto enumerateReferenceSolutions() { using ReferenceOp = ReferenceContraction_M2_N2_K2; + CDEElementwiseOperation, + ComputeDataType>; auto solution = std::make_unique>( std::make_unique()); diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 106dd5ff..60c1ce49 100644 --- 
a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -32,6 +32,36 @@ namespace hiptensor ContractionCpuReferenceInstances::ContractionCpuReferenceInstances() { // Register all the solutions exactly once + // Bilinear f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + float, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + float, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateReferenceSolutions<2, @@ -39,11 +69,56 @@ namespace hiptensor 2, float, float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipFloatComplex>()); // Bilinear f64 registerSolutions( @@ -52,11 +127,72 @@ namespace hiptensor 2, double, double, + float, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + double, ck::Tuple, double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + double>()); + + // Bilinear complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipDoubleComplex>()); + + // Scale f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + float, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 
2, + ck::bhalf_t, + ck::bhalf_t, + float, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f32 registerSolutions( @@ -65,11 +201,56 @@ namespace hiptensor 2, float, float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::ScaleComplex, + hipFloatComplex>()); // Scale f64 registerSolutions( @@ -78,10 +259,41 @@ namespace hiptensor 2, double, double, + float, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + double, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + double>()); + + // Scale complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::ScaleComplex, + hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 4fa7acf7..48508c6e 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -34,12 +34,12 @@ #include // hiptensor includes +#include "device/device_element_wise_operation_complex.hpp" #include "data_types.hpp" #include "meta_traits.hpp" namespace hiptensor { - // Partial specialize for Bilinear contraction template + typename BElementwiseOperation, + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>, + std::enable_if_t<(std::is_same_v) || + (std::is_same_v)>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = DsDataType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = 
ck::tensor_operation::element_wise::Bilinear;
+    /*
+     * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) for the CUDA bhalf_t type.
+     * What we want here is that we can use ck::bhalf_t with ck instances and use hip_bfloat16
+     * with hiptensor classes.
+     *
+     * When creating a solution, ck::bhalf_t was passed in to create the ck instance.
+     * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
+     */
+    using ADataT
+        = std::conditional_t, hip_bfloat16, ADataType>;
+    using BDataT
+        = std::conditional_t, hip_bfloat16, BDataType>;
+    using DDataT
+        = std::conditional_t, hip_bfloat16, DsDataType>;
+    using EDataT
+        = std::conditional_t, hip_bfloat16, EDataType>;
+    using ComputeDataT = std::conditional_t,
+                                            hip_bfloat16,
+                                            ComputeDataType>;
+    using AOp   = AElementwiseOperation;
+    using BOp   = BElementwiseOperation;
+    using CDEOp = CDEElementwiseOperation;
     };

     // Partial specialize for Scale contraction
@@ -82,7 +104,9 @@ namespace hiptensor
                  typename BDataType,
                  typename EDataType,
                  typename AElementwiseOperation,
-                 typename BElementwiseOperation>
+                 typename BElementwiseOperation,
+                 typename CDEElementwiseOperation,
+                 typename ComputeDataType>
     struct MetaTraits>
+                                 CDEElementwiseOperation,
+                                 ComputeDataType>,
+                      std::enable_if_t<(std::is_same_v) ||
+                                       (std::is_same_v)>>
     {
         constexpr static ck::index_t DimsM = NumDimsM;
         constexpr static ck::index_t DimsN = NumDimsN;
         constexpr static ck::index_t DimsK = NumDimsK;
-        using ADataT = ADataType;
-        using BDataT = BDataType;
-        using DDataT = NoneType;
-        using EDataT = EDataType;
-        using AOp    = AElementwiseOperation;
-        using BOp    = BElementwiseOperation;
-        using CDEOp  = ck::tensor_operation::element_wise::Scale;
+        using ADataT
+            = std::conditional_t, hip_bfloat16, ADataType>;
+        using BDataT
+            = std::conditional_t, hip_bfloat16, BDataType>;
+        using DDataT = NoneType;
+        using EDataT
+            = std::conditional_t, hip_bfloat16, EDataType>;
+        using ComputeDataT = std::conditional_t,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp   = AElementwiseOperation;
+        using BOp   = BElementwiseOperation;
+        using CDEOp = CDEElementwiseOperation;
     };

 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp
new file mode 100644
index 00000000..5032fa8a
--- /dev/null
+++ b/library/src/contraction/contraction_pack_util.hpp
@@ -0,0 +1,140 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_CONTRACTION_PACK_UTIL_HPP +#define HIPTENSOR_CONTRACTION_PACK_UTIL_HPP + +#include "data_types.hpp" +#include "util.hpp" +#include + +namespace hiptensor +{ + /** + * \brief This function performs multiply-accumulate of the form E = accum * alpha + D * beta + * + */ + template + __global__ void mfma(DataType* mE_real, DataType* mE_imag, DataType* mD_real, DataType* mD_imag, + HIP_vector_type *mE_grid, HIP_vector_type alpha, + HIP_vector_type beta, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCaddf( + hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)), + hipCmulf( + make_hipFloatComplex(mD_real[idx], mD_imag[idx]), + hipComplexDoubleToFloat(beta))); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCadd(hipCmul( + make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), + alpha), + hipCmul( + make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), + beta)); + } + } + } + + /** + * \brief This function performs multiply of the form C = accum * alpha + * + */ + template + __global__ void multiply(DataType* mE_real, DataType* mE_imag, HIP_vector_type *mE_grid, + HIP_vector_type alpha, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmul( + make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), + alpha); + } + } + } + + /** + * \brief This function unpacks structured data (hipFloatComplex / hipDoubleComplex) + * into non-structured data (float / double). + */ + template + __global__ void unpack(const InputType* in, OutputType* out_real, OutputType *out_img, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + out_real[idx] = hipCrealf(in[idx]); + out_img[idx] = hipCimagf(in[idx]); + } + else if constexpr(std::is_same_v) + { + out_real[idx] = hipCreal(in[idx]); + out_img[idx] = hipCimag(in[idx]); + } + } + } + + struct DeviceDeleter + { + void operator()(void* ptr) + { + CHECK_HIP_ERROR(hipFree(ptr)); + } + }; + + template + auto allocDevice(int64_t numElements) + { + T* data; + CHECK_HIP_ERROR(hipMalloc(&data, numElements * sizeof(T))); + return std::unique_ptr(data, DeviceDeleter()); + } + +} // namespace hiptensor + +#endif // HIPTENSOR_CONTRACTION_PACK_UTIL_HPP diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index aaa624f6..f96e8412 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -54,6 +54,7 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { // Make sure that we calculate full element space incase strides are not packed. 
@@ -71,8 +72,27 @@ namespace hiptensor * hipDataTypeSize(typeE); void *A_d, *B_d, *D_d, *E_d, *wspace; - float alpha = 1.02f; - float beta = 1.03f; + + /* + * `alpha` and `beta` are void pointer. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, the `alpha` and `bete` need to point to a ComputeData value + */ + ScalarData alpha; + ScalarData beta; + if(computeType == HIPTENSOR_COMPUTE_C32F || computeType == HIPTENSOR_COMPUTE_C64F) + { + writeVal(&alpha, computeType, {computeType, 1.02, 1.03}); + writeVal(&beta, computeType, {computeType, 1.04, 1.05}); + } + else + { + writeVal(&alpha, computeType, ScalarData(computeType, 1.02)); + writeVal(&beta, computeType, ScalarData(computeType, 1.03)); + } CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB)); @@ -151,7 +171,12 @@ namespace hiptensor } template <> - struct ActorCriticSelection + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -179,329 +204,55 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 43) + unique_id = 11124293857315312720ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d5 <= 61) - { - if(d3 <= 236) - { - if(d4 <= 519) - { - if(d1 <= 744) - { - if(d6 <= 8) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d3 <= 32) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - else - { - if(d6 <= 2) - { - if(d5 <= 15) - { - unique_id = 17618515137355245877ull; - } - else - { - if(d6 <= 1) - { - unique_id = 10830479759059230274ull; - } - else - { - if(d5 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - else - { - if(d5 <= 2) - { - if(d6 <= 8) - { - unique_id = 17618515137355245877ull; - } - else - { - unique_id = 10830479759059230274ull; - } - } - else - { - if(d1 <= 54) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d4 <= 218) - { - if(d5 <= 36) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d2 <= 50) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - } - } - } - } - } - else - { - if(d6 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d4 <= 557) - { - if(d2 <= 165) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d5 <= 68) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 24) - { - if(d3 <= 435) - { - if(d5 <= 7) - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 744) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d6 <= 60) - { - unique_id = 
4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - } - } - else - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - if(d5 <= 13) - { - if(d5 <= 7) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d6 <= 58) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d1 <= 642) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } - else - { - if(d6 <= 54) - { - if(d5 <= 37) - { - if(d4 <= 556) - { - unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 222) - { - if(d4 <= 556) - { - unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d4 <= 44) - { - if(d3 <= 436) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d1 <= 220) - { - if(d2 <= 107) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d3 <= 72) - { - unique_id = 16481146763982821264ull; - } - else - { - if(d2 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR, + float> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 1953020431947874122ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -516,7 +267,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -544,322 +300,55 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 9) + unique_id = 14895098881714635802ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 4) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 16) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d2 <= 196) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 113) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d3 <= 219) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 8) - { - if(d6 <= 28) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 2) - { - if(d6 <= 58) - { - unique_id = 9622108777680582053ull; - } - 
else - { - if(d5 <= 1) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d2 <= 163) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 465) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } - else - { - if(d3 <= 121) - { - if(d4 <= 483) - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 152) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - else - { - if(d3 <= 37) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d4 <= 135) - { - if(d3 <= 413) - { - if(d6 <= 30) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d4 <= 36) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 120) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 32) - { - if(d5 <= 32) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } - else - { - if(d2 <= 115) - { - if(d6 <= 40) - { - if(d2 <= 51) - { - unique_id = 222393107113976106ull; - } - else - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d4 <= 486) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d1 <= 235) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 22) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d6 <= 32) - { - if(d5 <= 26) - { - if(d6 <= 23) - { - if(d1 <= 116) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - else - { - if(d5 <= 18) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - if(d5 <= 32) - { - if(d6 <= 16) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int 
d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 8517235228581081946ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -874,7 +363,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -893,7 +382,6 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { - int d1 = a_ms_ks_lengths[0]; int d2 = a_ms_ks_lengths[1]; int d3 = b_ns_ks_lengths[0]; @@ -903,238 +391,50 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 36) + unique_id = 17313709378682913599ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 35) - { - if(d1 <= 763) - { - if(d6 <= 3) - { - if(d5 <= 8) - { - unique_id = 9769367948782541618ull; - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - if(d6 <= 24) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 17) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - } - else - { - if(d5 <= 9) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d1 <= 759) - { - if(d6 <= 67) - { - if(d3 <= 535) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d4 <= 615) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - else - { - if(d5 <= 25) - { - if(d4 <= 428) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - else - { - if(d6 <= 64) - { - if(d3 <= 65) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 25) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 33) - { - if(d6 <= 8) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d2 <= 565) - { - if(d1 <= 646) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d6 <= 27) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 53) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - else - { - if(d6 <= 20) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 64) - { - if(d1 <= 648) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d6 <= 25) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } - } - else - { - if(d5 <= 45) - { - if(d6 <= 50) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - if(d6 <= 43) - { - if(d5 <= 52) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + 
static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 14397647188602189900ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1149,7 +449,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1177,217 +477,55 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 39) + unique_id = 8339198051871565944ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d3 <= 937) - { - if(d6 <= 1) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d4 <= 754) - { - if(d5 <= 33) - { - if(d5 <= 1) - { - if(d6 <= 25) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d6 <= 6) - { - if(d5 <= 8) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d1 <= 404) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 50) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - } - else - { - unique_id = 1830537384143755749ull; - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 32) - { - if(d2 <= 832) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 8) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 24) - { - unique_id = 17689908062647780665ull; - } - else - { - if(d5 <= 64) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - else - { - if(d6 <= 46) - { - if(d5 <= 54) - { - if(d1 <= 460) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 49) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - else - { - if(d1 <= 182) - { - if(d5 <= 65) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - else - { - if(d5 <= 49) - { - if(d6 <= 64) - { - if(d1 <= 411) - { - if(d2 <= 396) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 53) - { - if(d1 <= 222) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - 
else - { - unique_id = 4992687403741300893ull; - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 2724417728984064737ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1401,8 +539,11 @@ namespace hiptensor } }; - hiptensorStatus_t - actorCriticModel(ContractionSolution** winner, + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, std::unordered_map const& candidates, hipDataType typeA, std::vector const& a_ms_ks_lengths, @@ -1417,88 +558,889 @@ namespace hiptensor std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) - { - if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F) - { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); - } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F - && typeE == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 5943247903036531691ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE - && typeE == HIP_R_64F) + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - 
b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 17972447156160297755ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F - && typeE == HIP_R_64F) - { - return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 3893144338697524749ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + unique_id = 15165261158317928321ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector 
const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 14511729289005214097ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 3636246152928348445ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 5711776907278244209ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t 
unique_id = 0; + + unique_id = 355777364055884033ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 3085227716611397774ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 2196983681630807584ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + hiptensorStatus_t + actorCriticModel(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, + const uint64_t workspaceSize) + { + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && 
typeE == HIP_R_16F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE + && typeE == HIP_R_16BF && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF + && typeE == HIP_R_16BF && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + 
typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_32F && typeB == HIP_C_32F && typeD == NONE_TYPE && typeE == HIP_C_32F + && computeType == HIPTENSOR_COMPUTE_C32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_32F && typeB == HIP_C_32F && typeD == HIP_C_32F && typeE == HIP_C_32F + && computeType == HIPTENSOR_COMPUTE_C32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == NONE_TYPE && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == HIP_C_64F && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } 
return HIPTENSOR_STATUS_EXECUTION_FAILED; } diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp index 9ceb6a14..deb980d9 100644 --- a/library/src/contraction/contraction_selection.hpp +++ b/library/src/contraction/contraction_selection.hpp @@ -49,9 +49,15 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); - template + template struct ActorCriticSelection { static hiptensorStatus_t @@ -87,6 +93,7 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index 0037584e..97dde1ca 100644 --- a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -38,6 +38,8 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" + #include "contraction_meta_traits.hpp" #include "contraction_solution_params.hpp" #include "performance.hpp" @@ -147,7 +149,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> std::vector> enumerateContractionSolutions(); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 0fb5df9d..09e300a7 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -35,11 +35,11 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolution const& s) const noexcept + size_t operator()(hiptensor::ContractionSolution const& s) const noexcept { - return std::hash{}(*s.params()); + return hash{}(*s.params()); } }; } @@ -52,8 +52,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Bilinear>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Bilinear>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::BilinearComplex>)>> : public ContractionSolution { public: @@ -90,16 +92,18 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; - auto betaF = 0.0f; + ScalarData alphaF; + ScalarData betaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } if(beta != nullptr) { - betaF = hiptensor::readVal(beta, HipDataType_v); + betaF = hiptensor::readVal( + beta, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... 
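Since the hunk above now reads `alpha`/`beta` back with `readVal` at the type reported by `convertToComputeType`, callers have to stage both scalars as `ScalarData` values rather than plain `float`. A minimal caller-side sketch, mirroring the test change at the top of this patch (`ScalarData`, `writeVal`, and the `HIPTENSOR_COMPUTE_*` enumerators come from this patch; the `computeType` variable and the `1.0`/`0.0` literals are illustrative assumptions):

```cpp
// Illustrative sketch only: stage alpha/beta at the contraction's compute type
// so that readVal(..., convertToComputeType(...)) recovers the intended values.
ScalarData alpha;
ScalarData beta;
if(computeType == HIPTENSOR_COMPUTE_C32F || computeType == HIPTENSOR_COMPUTE_C64F)
{
    // Complex compute types are initialized from a {real, imaginary} pair.
    writeVal(&alpha, computeType, {computeType, 1.0, 0.0});
    writeVal(&beta, computeType, {computeType, 1.0, 0.0});
}
else
{
    writeVal(&alpha, computeType, ScalarData(computeType, 1.0));
    writeVal(&beta, computeType, ScalarData(computeType, 1.0));
}
// &alpha and &beta are then passed as the void* scalar arguments, e.g. to
// hiptensorContraction(...), which forwards them to the solver shown above.
```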
@@ -123,7 +127,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF, betaF})); + typename Traits::CDEOp(alphaF, betaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); @@ -163,8 +167,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Scale>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Scale>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::ScaleComplex>)>> : public ContractionSolution { public: @@ -201,11 +207,12 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; + ScalarData alphaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... @@ -229,7 +236,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF})); + typename Traits::CDEOp(alphaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); @@ -274,7 +281,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> std::vector> enumerateContractionSolutions() { using ContractionOp @@ -287,7 +295,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation>; + CDEElementwiseOperation, + ComputeDataType>; using Factory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index fd263a8b..ad5b4408 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -27,11 +27,44 @@ #include "contraction_solution_instances.hpp" #include "contraction_solution.hpp" +// Ensure access to +#include "device/hiptensor_contraction_bilinear_instances.hpp" +#include "device/hiptensor_contraction_scale_instances.hpp" + namespace hiptensor { ContractionSolutionInstances::ContractionSolutionInstances() { // Register all the solutions exactly once + + // Bilinear bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateContractionSolutions<2, @@ -43,7 +76,48 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + 
registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); + + // Bilinear complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipFloatComplex>()); // Bilinear f64 registerSolutions( @@ -56,7 +130,62 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); + + // Bilinear complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipDoubleComplex>()); + + // Scale bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f32 registerSolutions( @@ -69,7 +198,48 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); + + // scale complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + 
ck::tensor_operation::element_wise::ScaleComplex, + hipFloatComplex>()); // Scale f64 registerSolutions( @@ -82,6 +252,34 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); + // scale complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::ScaleComplex, + hipDoubleComplex>()); + } } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp index ec9de45c..4c44de88 100644 --- a/library/src/contraction/contraction_solution_params.hpp +++ b/library/src/contraction/contraction_solution_params.hpp @@ -49,10 +49,11 @@ namespace hiptensor virtual int32_t dimsK() const = 0; // Map to hipDataType - virtual hipDataType typeA() const = 0; - virtual hipDataType typeB() const = 0; - virtual hipDataType typeC() const = 0; - virtual hipDataType typeD() const = 0; + virtual hipDataType typeA() const = 0; + virtual hipDataType typeB() const = 0; + virtual hipDataType typeC() const = 0; + virtual hipDataType typeD() const = 0; + virtual hiptensorComputeType_t typeCompute() const = 0; // Map to operators virtual hiptensorOperator_t opA() const = 0; diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index bff33960..3abcaede 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -35,13 +35,14 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept + size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept { return hiptensor::Hash{}(s.dimsM(), s.dimsN(), s.dimsK(), + s.typeCompute(), s.typeA(), s.typeB(), s.typeC(), @@ -102,6 +103,11 @@ namespace hiptensor return HipDataType_v; } + hiptensorComputeType_t typeCompute() const override + { + return convertToComputeType(HipDataType_v); + } + hiptensorOperator_t opA() const override { return ElementWiseOperatorType_v; diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp index 83674c81..9e2da1f9 100644 --- a/library/src/contraction/contraction_solution_registry.cpp +++ b/library/src/contraction/contraction_solution_registry.cpp @@ -53,19 +53,20 @@ namespace hiptensor } ContractionSolutionRegistry::Query - ContractionSolutionRegistry::Query::query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const + ContractionSolutionRegistry::Query::query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType 
typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const { - auto solutionHash - = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + auto solutionHash = hashSolution( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end()) { @@ -81,10 +82,14 @@ namespace hiptensor return query(hashDimsMNK(dimsM, dimsN, dimsK)); } - ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const + ContractionSolutionRegistry::Query + ContractionSolutionRegistry::Query::query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const { - return query(hashTypesABCD(typeA, typeB, typeC, typeD)); + return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute)); } ContractionSolutionRegistry::Query @@ -159,18 +164,20 @@ namespace hiptensor /* static */ ContractionSolutionRegistry::Query::HashId - ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) + ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) { - return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + return Hash{}( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); } /* static */ @@ -181,10 +188,14 @@ namespace hiptensor } /* static */ - ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) + ContractionSolutionRegistry::Query::HashId + ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) { - return Hash{}(typeA, typeB, typeC, typeD); + return Hash{}(typeA, typeB, typeC, typeD, typeCompute); } /* static */ @@ -220,12 +231,16 @@ namespace hiptensor params->typeD(), params->opA(), params->opB(), - params->opCDE()); + params->opCDE(), + params->typeCompute()); auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK()); - auto typesABCDHash - = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD()); + auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(), + params->typeB(), + params->typeC(), + params->typeD(), + params->typeCompute()); auto elementOpsHash = hashElementOps(params->opA(), params->opB()); @@ -236,7 +251,7 @@ namespace hiptensor mAllSolutions[solutionUid] = solution; mSolutionHash[solutionHash].push_back(solution); mSolutionHash[dimsMNKHash].push_back(solution); - mSolutionHash[typesABCDHash].push_back(solution); + mSolutionHash[typesComputeABCDHash].push_back(solution); mSolutionHash[elementOpsHash].push_back(solution); mSolutionHash[contactionOpsHash].push_back(solution); } diff --git a/library/src/contraction/contraction_solution_registry.hpp 
b/library/src/contraction/contraction_solution_registry.hpp index d1b80ec5..44aaa97d 100644 --- a/library/src/contraction/contraction_solution_registry.hpp +++ b/library/src/contraction/contraction_solution_registry.hpp @@ -59,25 +59,27 @@ namespace hiptensor /// E.g. in this context, query further parameters. // By full solution type - Query query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const; + Query query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const; // By dimensions Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const; // By data types - Query query(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD) const; + Query query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const; // By element-wise operations Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const; @@ -104,22 +106,24 @@ namespace hiptensor Query query(HashId queryHash) const; // Hashing helpers - static HashId hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE); + static HashId hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute); static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK); - static HashId hashTypesABCD(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD); + static HashId hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute); static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB); static HashId hashContractionOps(ContractionOpId_t opCDE); diff --git a/library/src/contraction/contraction_types.hpp b/library/src/contraction/contraction_types.hpp index 101d72dc..e4930726 100644 --- a/library/src/contraction/contraction_types.hpp +++ b/library/src/contraction/contraction_types.hpp @@ -40,6 +40,8 @@ namespace hiptensor { SCALE = 0, ///< \f${C=\alpha\mathcal{A}\mathcal{B}}\f$ BILINEAR = 1, ///< \f${D=\alpha\mathcal{A}\mathcal{B}+\beta\mathcal{C}}\f$ + SCALE_COMPLEX = 2, + BILINEAR_COMPLEX = 3, UNKNOWN, }; diff --git a/library/src/contraction/contraction_types_impl.hpp b/library/src/contraction/contraction_types_impl.hpp index d8fa0f74..070718cc 100644 --- a/library/src/contraction/contraction_types_impl.hpp +++ b/library/src/contraction/contraction_types_impl.hpp @@ -32,6 +32,7 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" #include "contraction_types.hpp" #include @@ -51,12 +52,24 @@ namespace hiptensor static constexpr auto value = ContractionOpId_t::SCALE; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::SCALE_COMPLEX; + }; + template <> struct ContractionOperatorType { static constexpr auto 
value = ContractionOpId_t::BILINEAR; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::BILINEAR_COMPLEX; + }; + } // namespace hiptensor #endif // HIPTENSOR_CONTRACTION_TYPES_IMPL_HPP diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index f2e4a0fb..b65a8ab1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -24,24 +24,80 @@ # ############################################################################### -set(CK_CONTRACTION_INSTANCE_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp -) + set(CK_CONTRACTION_INSTANCE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/common.hpp b/library/src/contraction/device/common.hpp index f530b2e2..efd4866c 100644 --- a/library/src/contraction/device/common.hpp +++ b/library/src/contraction/device/common.hpp @@ -39,4 +39,6 @@ #include #include +#include "device_element_wise_operation_complex.hpp" + #endif // CONTRACTION_DEVICE_COMMON_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp new file mode 100644 index 00000000..307ecb1c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -0,0 +1,718 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP
+#define HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP
+
+#include "../contraction_pack_util.hpp"
+#include "common.hpp"
+#include
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+
+            using hiptensor::allocDevice;
+            using hiptensor::ceilDiv;
+            using hiptensor::DeviceDeleter;
+            using hiptensor::elementSpaceFromLengthsAndStrides;
+
+            using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+            using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex;
+            using Scale = ck::tensor_operation::element_wise::Scale;
+            using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex;
+
+            // The following is a specialization class for bilinear contractions of complex types.
+            // For complex types, the contraction can be decomposed into 4 simple contractions
+            // (2 scale and 2 bilinear) on the underlying real element type.
+            // The class implements a CK interface to wrap the 4 individual contraction operations
+            // and argument handling internally.
+            // Note: We are assuming that the data comes in as an Array of Structures (AOS) format
+            // in complex pairs. The argument initialization portion decomposes this data into a
+            // Structure of Arrays (SOA), where the real and imaginary elements can be operated on
+            // separately.
+
+            // Tensor Contraction:
+            //   input : A
+            //   input : B
+            //   input : D0, D1, ...
+            //   output : E
+            //   C = a_op(A) * b_op(B)
+            //   E = cde_op(C, D0, D1, ...)
+            // Assume:
+            //   A[M0, M1, M2, ..., K0, K1, K2, ...]
+            //   B[N0, N1, N2, ..., K0, K1, K2, ...]
+            //   D[M0, M1, M2, ..., N0, N1, N2, ...]
+            //   E[M0, M1, M2, ..., N0, N1, N2, ...]
+            template
+            struct DeviceContractionMultipleD_Xdl_CShuffle<
+                NumDimM,
+                NumDimN,
+                NumDimK,
+                HIP_vector_type,
+                HIP_vector_type,
+                AccDataType,
+                CShuffleDataType,
+                ck::Tuple>,
+                HIP_vector_type,
+                AElementwiseOperation,
+                BElementwiseOperation,
+                BilinearComplex,
+                GemmSpec,
+                NumGemmKPrefetchStage,
+                BlockSize,
+                MPerBlock,
+                NPerBlock,
+                KPerBlock,
+                AK1,
+                BK1,
+                MPerXDL,
+                NPerXDL,
+                MXdlPerWave,
+                NXdlPerWave,
+                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                ABlockTransferThreadClusterArrangeOrder,
+                ABlockTransferSrcAccessOrder,
+                ABlockTransferSrcVectorDim,
+                ABlockTransferSrcScalarPerVector,
+                ABlockTransferDstScalarPerVector_AK1,
+                ABlockLdsExtraM,
+                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                BBlockTransferThreadClusterArrangeOrder,
+                BBlockTransferSrcAccessOrder,
+                BBlockTransferSrcVectorDim,
+                BBlockTransferSrcScalarPerVector,
+                BBlockTransferDstScalarPerVector_BK1,
+                BBlockLdsExtraN,
+                CShuffleMXdlPerWavePerShuffle,
+                CShuffleNXdlPerWavePerShuffle,
+                CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                CDEBlockTransferScalarPerVector_NPerBlock,
+                HIP_vector_type,
+                LoopSched>
+
+                : public DeviceContractionMultipleD,
+                         HIP_vector_type,
+                         ck::Tuple>,
+                         HIP_vector_type,
+                         AElementwiseOperation,
+                         BElementwiseOperation,
+                         BilinearComplex,
+                         HIP_vector_type>
+            {
+                // Complex device Op
+                using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle;
+
+                // CDE Operations
+                using ScaleCDEElementwiseOperation = ScaleComplex;
+                using DecompScaleCDEElementwiseOperation = Scale;
+                using BilinearCDEElementwiseOperation = BilinearComplex;
+                using DecompBilinearCDEElementwiseOperation = Bilinear;
+
+                // Complex types given through the interface
+                using ComplexA = HIP_vector_type;
+                using ComplexB = HIP_vector_type;
+                using ComplexDs = HIP_vector_type;
+                using ComplexE = HIP_vector_type;
+                using ComplexCompute = HIP_vector_type;
+
+                // Internal functional types we will use to
+                // decompose the complex types and operate on.
+                using DecompA = ADataType;
+                using DecompB = BDataType;
+                using DecompDs = DsDataType;
+                using DecompE = EDataType;
+                using DecompCompute = ComputeDataType;
+
+                // For complex types, we need to make sure that all of the types are the same
+                static_assert(std::is_same_v && std::is_same_v
+                                  && std::is_same_v
+                                  && std::is_same_v
+                                  && std::is_same_v,
+                              "Complex operations must have the same data type");
+
+                static_assert(std::is_same_v || std::is_same_v,
+                              "Complex operations only supported with single or double precision");
+
+                static constexpr index_t NumDTensor = 1;
+
+                // The internal operation that we will decompose the complex operations with.
+                // For complex types, this will be either float or double.
+                using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle<
+                    NumDimM,
+                    NumDimN,
+                    NumDimK,
+                    DecompA,
+                    DecompB,
+                    AccDataType,
+                    CShuffleDataType,
+                    ck::Tuple<>,
+                    DecompE,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    DecompScaleCDEElementwiseOperation,
+                    GemmSpec,
+                    NumGemmKPrefetchStage,
+                    BlockSize,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    AK1,
+                    BK1,
+                    MPerXDL,
+                    NPerXDL,
+                    MXdlPerWave,
+                    NXdlPerWave,
+                    ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                    ABlockTransferThreadClusterArrangeOrder,
+                    ABlockTransferSrcAccessOrder,
+                    ABlockTransferSrcVectorDim,
+                    ABlockTransferSrcScalarPerVector,
+                    ABlockTransferDstScalarPerVector_AK1,
+                    ABlockLdsExtraM,
+                    BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                    BBlockTransferThreadClusterArrangeOrder,
+                    BBlockTransferSrcAccessOrder,
+                    BBlockTransferSrcVectorDim,
+                    BBlockTransferSrcScalarPerVector,
+                    BBlockTransferDstScalarPerVector_BK1,
+                    BBlockLdsExtraN,
+                    CShuffleMXdlPerWavePerShuffle,
+                    CShuffleNXdlPerWavePerShuffle,
+                    CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                    CDEBlockTransferScalarPerVector_NPerBlock,
+                    DecompCompute,
+                    LoopSched>;
+
+                // The internal operation that we will decompose the complex operations with.
+                // For complex types, this will be either float or double.
+                using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle<
+                    NumDimM,
+                    NumDimN,
+                    NumDimK,
+                    DecompA,
+                    DecompB,
+                    AccDataType,
+                    CShuffleDataType,
+                    ck::Tuple,
+                    DecompE,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    DecompBilinearCDEElementwiseOperation,
+                    GemmSpec,
+                    NumGemmKPrefetchStage,
+                    BlockSize,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    AK1,
+                    BK1,
+                    MPerXDL,
+                    NPerXDL,
+                    MXdlPerWave,
+                    NXdlPerWave,
+                    ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                    ABlockTransferThreadClusterArrangeOrder,
+                    ABlockTransferSrcAccessOrder,
+                    ABlockTransferSrcVectorDim,
+                    ABlockTransferSrcScalarPerVector,
+                    ABlockTransferDstScalarPerVector_AK1,
+                    ABlockLdsExtraM,
+                    BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                    BBlockTransferThreadClusterArrangeOrder,
+                    BBlockTransferSrcAccessOrder,
+                    BBlockTransferSrcVectorDim,
+                    BBlockTransferSrcScalarPerVector,
+                    BBlockTransferDstScalarPerVector_BK1,
+                    BBlockLdsExtraN,
+                    CShuffleMXdlPerWavePerShuffle,
+                    CShuffleNXdlPerWavePerShuffle,
+                    CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                    CDEBlockTransferScalarPerVector_NPerBlock,
+                    DecompCompute,
+                    LoopSched>;
+
+                // Argument
+                struct Argument : public BaseArgument
+                {
+                    using ScaleDecompArgument = typename ScaleDecompOp::Argument;
+                    using BilinearDecompArgument = typename BilinearDecompOp::Argument;
+
+                    Argument(Argument&& other)
+                        : mScaleArgs(
+                            {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])})
+                        , mBilinearArgs({std::move(other.mBilinearArgs[0]),
+                                         std::move(other.mBilinearArgs[1])})
+                    {
+                    }
+
+                    Argument& operator=(Argument&& other)
+                    {
+                        if(this != &other)
+                        {
+                            mScaleArgs[0] = std::move(other.mScaleArgs[0]);
+                            mScaleArgs[1] = std::move(other.mScaleArgs[1]);
+                            mBilinearArgs[0] = std::move(other.mBilinearArgs[0]);
+                            mBilinearArgs[1] = std::move(other.mBilinearArgs[1]);
+                        }
+                        return *this;
+                    }
+
+                    Argument(const void* p_a_grid,
+                             const void* p_b_grid,
+                             std::array p_ds_grid,
+                             void* p_e_grid,
+                             const std::vector& a_ms_ks_lengths,
+                             const std::vector& a_ms_ks_strides,
+                             const std::vector& b_ns_ks_lengths,
+                             const std::vector& b_ns_ks_strides,
+                             const std::array, NumDTensor>& ds_ms_ns_lengths,
+                             const std::array, NumDTensor>& ds_ms_ns_strides,
+                             const std::vector& e_ms_ns_lengths,
+                             const std::vector& e_ms_ns_strides,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             BilinearCDEElementwiseOperation cde_element_op)
+                        : element_op(cde_element_op)
+                    {
+                        // Take the incoming arguments and treat them as complex.
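+                        // The four decomposed passes set up below compute, per element:
+                        //   E_real = A_real * B_real - A_imag * B_imag   (one scale + one bilinear pass)
+                        //   E_imag = A_real * B_imag + A_imag * B_real   (one scale + one bilinear pass)
+                        // The complex epilogue E = alpha * (A * B) + beta * D is applied afterwards
+                        // in the invoker, once both real and imaginary parts are available.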
+ + // Allocate Real and Imaginary inputs + auto elementsA + = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides); + auto elementsB + = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); + auto elementsD = elementSpaceFromLengthsAndStrides(ds_ms_ns_lengths[0], + ds_ms_ns_strides[0]); + elementsE + = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + + mA_real.reset(nullptr); + mA_imag.reset(nullptr); + mB_real.reset(nullptr); + mB_imag.reset(nullptr); + mD_real.reset(nullptr); + mD_imag.reset(nullptr); + mE_real.reset(nullptr); + mE_imag.reset(nullptr); + + mE_grid = p_e_grid; + auto blockDim = dim3(1024); + + auto decompGrid = [blockDim](auto& out_r, + auto& out_i, + auto const* input_grid, + uint32_t elementCount) { + using DecompT = typename std::decay_t::element_type; + static_assert(std::is_same_v< + DecompT, + typename std::decay_t::element_type>, + "r and i buffers must be same type"); + + if(input_grid != nullptr) + { + out_r = std::move(allocDevice(elementCount)); + out_i = std::move(allocDevice(elementCount)); + + auto gridDim = dim3(ceilDiv(elementCount, blockDim.x)); + hiptensor::unpack<<>>( + input_grid, out_r.get(), out_i.get(), elementCount); + } + }; + + // Decompose the incoming data from AOS->SOA + decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); + + auto allocScaleArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + auto allocBilinearArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& in_d, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{in_d.get()}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{e_ms_ns_lengths}, + std::array, 1>{e_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); + } + + void Print() const + { + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + mBilinearArgs[1]->Print(); + } 
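+
+                    // What follows is the per-decomposition state: two scale and two
+                    // bilinear argument sets (one pair each for the real and imaginary
+                    // outputs), plus device buffers that keep the unpacked SOA copies of
+                    // A, B, D and E alive for the lifetime of this Argument.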
+ + // private: + // Each argument set for complex: + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; + + template + using DeviceArray = std::unique_ptr; + + // Manage extra memory for AOS->SOA + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mD_real; + DeviceArray mD_imag; + DeviceArray mE_real; + DeviceArray mE_imag; + + BilinearCDEElementwiseOperation element_op; + void* mE_grid; + index_t elementsE; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = typename DeviceOp::Argument; + + Invoker() + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) + { + } + + Invoker(Invoker&& other) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , mBilinearInvoker(std::move(other.mBilinearInvoker)) + { + } + + Invoker& operator=(Invoker&& other) + { + if(this != &other) + { + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); + } + return *this; + } + + float Run(const Argument& arg, + const StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); + + if(arg.mE_grid != nullptr) + { + auto blockDim = dim3(1024); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + hiptensor::mfma<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + arg.mD_real.get(), + arg.mD_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.alpha_, + arg.element_op.beta_, + arg.elementsE); + } + + return r0 + r1 + r2 + r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. 
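+                    // Note: the same workspace pointer is shared by the base argument and
+                    // all four decomposed argument sets below.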
+ this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + BilinearCDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + BilinearCDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..3b3f6d47 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * 
Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..fd43f0ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..21fb8127 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..cc975c03 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp new file mode 100644 index 00000000..4601021e --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp new file mode 100644 index 00000000..e3f60146 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp new file mode 100644 index 00000000..c2fd7c84 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp new file mode 100644 index 00000000..8203a4e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp new file mode 100644 index 00000000..9d779671 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp new file mode 100644 index 00000000..4197dda2 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp new file mode 100644 index 00000000..cc519368 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp new file mode 100644 index 00000000..ff187398 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ff670630 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..be8bfe84 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
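
The kknn/knnn/mknn/mnnn suffixes in these file names encode which mode of each of A/B/D/E is the fast-changing (unit-stride) one, as the per-file comments note. A minimal sketch of what the first letter means for A[m0, m1, k0, k1], with hypothetical helper names that are not part of the patch:

    #include <array>
    #include <cstddef>

    // "k"-fast layout (the first "k" in kknn): k1 is the unit-stride mode of A.
    // Mode order slowest -> fastest is m0, m1, k0, k1.
    std::array<std::size_t, 4> stridesKFast(std::size_t M0, std::size_t M1, std::size_t K0, std::size_t K1)
    {
        (void)M0; // the outermost extent does not affect any stride
        return {M1 * K0 * K1, K0 * K1, K1, 1}; // strides of (m0, m1, k0, k1)
    }

    // "m"-fast layout (the "m" in mknn): m1 is the unit-stride mode of A.
    // Mode order slowest -> fastest is k0, k1, m0, m1.
    std::array<std::size_t, 4> stridesMFast(std::size_t M0, std::size_t M1, std::size_t K0, std::size_t K1)
    {
        (void)K0; // the outermost extent does not affect any stride
        return {M1, 1, K1 * M0 * M1, M0 * M1}; // strides of (m0, m1, k0, k1)
    }
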
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..4be69898 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..2f6d630b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
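
CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK, defined at the top of every file here, opts into out-of-bounds-checked buffer loads. Conceptually, such a load returns zero for offsets past the end of the buffer instead of reading stray memory; a scalar model under that assumption (hypothetical helper, the real mechanism lives in CK's buffer-addressing code):

    #include <cstddef>

    // Conceptual model of an out-of-bounds-checked buffer load: AMD buffer
    // descriptors can return 0 for addresses outside their range, which the
    // OOB-check trick relies on.
    template <typename T>
    T oobCheckedLoad(const T* data, std::size_t elementCount, std::size_t offset)
    {
        return offset < elementCount ? data[offset] : T{0};
    }
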
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
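
All of these files register instances of the same bilinear contraction, E[m0, m1, n0, n1] = alpha * A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + beta * D[m0, m1, n0, n1]. A host-side reference of those semantics with the paired modes flattened to single M/N/K extents (a minimal sketch, assuming row-major float buffers; not the library's kernel):

    #include <cstddef>
    #include <vector>

    // Reference semantics of the bilinear contraction (illustration only):
    // E[m, n] = alpha * sum_k A[m, k] * B[n, k] + beta * D[m, n],
    // where m = (m0, m1), n = (n0, n1), k = (k0, k1) are flattened.
    void bilinearContractionRef(const std::vector<float>& A,
                                const std::vector<float>& B,
                                const std::vector<float>& D,
                                std::vector<float>&       E,
                                std::size_t M, std::size_t N, std::size_t K,
                                float alpha, float beta)
    {
        for(std::size_t m = 0; m < M; ++m)
        {
            for(std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                {
                    acc += A[m * K + k] * B[n * K + k];
                }
                E[m * N + n] = alpha * acc + beta * D[m * N + n];
            }
        }
    }
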
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
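
Each add_device_contraction_* entry point follows the same shape: the using-alias names a tuple of kernel configurations, and add_device_operation_instances appends one default-constructed object per tuple element to the caller's vector. A simplified model of that mechanism (assumed shape only; CK's real template differs in detail):

    #include <memory>
    #include <tuple>
    #include <vector>

    // Simplified model of CK's add_device_operation_instances: walk an op
    // tuple and register one instance of every configuration. Assumes each
    // Op in the tuple derives from BaseOp.
    template <typename BaseOp, typename... Ops>
    void addDeviceOperationInstances(std::vector<std::unique_ptr<BaseOp>>& instances,
                                     std::tuple<Ops...>)
    {
        (instances.push_back(std::make_unique<Ops>()), ...);
    }
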
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp new file mode 100644 index 00000000..d82ea442 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
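
The compute_f16 and compute_bf16 variants keep f32 storage but perform the multiply in the narrower compute type. A scalar model of one such step (illustration only; assumes compiler support for _Float16):

    // Scalar model of the f32-storage / f16-compute variants: operands are
    // stored as float, converted to the compute type for the multiply, and
    // accumulated back in float.
    float mixedPrecisionFma(float a, float b, float acc)
    {
        _Float16 ah = static_cast<_Float16>(a); // down-convert storage -> compute
        _Float16 bh = static_cast<_Float16>(b);
        return acc + static_cast<float>(ah) * static_cast<float>(bh);
    }
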
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp new file mode 100644 index 00000000..772df2e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp new file mode 100644 index 00000000..8b1d0681 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
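
On the consumer side, a vector is filled by these add_* functions and a usable instance is chosen at runtime. A hedged sketch with a hypothetical interface (the real DeviceContractionMultipleD API exposes IsSupportedArgument and argument/invoker factories with far more parameters):

    #include <memory>
    #include <vector>

    // Hypothetical op interface used only for this sketch.
    struct OpSketch
    {
        virtual ~OpSketch()              = default;
        virtual bool isSupported() const = 0;
        virtual void run() const         = 0;
    };

    // Pick the first registered configuration that supports the problem.
    bool runFirstSupported(const std::vector<std::unique_ptr<OpSketch>>& instances)
    {
        for(const auto& op : instances)
        {
            if(op->isSupported())
            {
                op->run();
                return true;
            }
        }
        return false; // no registered configuration can handle the problem
    }
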
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, 
F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { 
add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| 
_NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 
2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
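// Every bilinear instance in this patch realizes the contraction stated in the
// per-file comments: E[m0,m1,n0,n1] = alpha * sum_{k0,k1} A[m0,m1,k0,k1] * B[n0,n1,k0,k1]
//                                     + beta * D[m0,m1,n0,n1],
// with alpha and beta supplied through the Bilinear element-wise operation. A
// minimal scalar reference sketch, assuming dense row-major buffers; the
// function name and extent parameters are illustrative, not part of the library:
void reference_bilinear_contraction(const float* A, const float* B, const float* D, float* E,
                                    int M0, int M1, int N0, int N1, int K0, int K1,
                                    float alpha, float beta)
{
    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
                for(int n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f;
                    for(int k0 = 0; k0 < K0; ++k0)
                        for(int k1 = 0; k1 < K1; ++k1)
                            acc += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1]
                                   * B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
                    const int e = ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
                    E[e] = alpha * acc + beta * D[e];
                }
}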
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
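// Each hunk above swaps a hand-maintained std::tuple of
// DeviceContractionMultipleD_Xdl_CShuffle configurations for a shared alias
// (device_contraction_kk_instance, _kn_, _mk_, _mn_) coming from the new
// device_contraction_instance.hpp header, so one tuning table serves every
// data-type/layout file. A self-contained toy of that pattern; all names
// here are hypothetical stand-ins, not the real CK templates:
#include <tuple>

template <typename DataT, int BlockSize, int MPerBlock, int NPerBlock>
struct KernelConfig
{
};

// One templated table replaces several per-type copies that differed only in DataT.
template <typename DataT>
using contraction_kk_table = std::tuple<KernelConfig<DataT, 256, 256, 128>,
                                        KernelConfig<DataT, 256, 128, 256>,
                                        KernelConfig<DataT, 128, 128, 128>>;

// contraction_kk_table<float> and contraction_kk_table<double> now share one
// definition, which is what the device_contraction_*_instance aliases achieve
// for the real instance lists.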
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ad5ce461 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
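// The *_compute_f32_* files introduced here register F64 tensor instances that
// accumulate in F32: that is the trailing compute-type template argument this
// patch adds to DeviceContractionMultipleD. A hedged caller-side sketch; the
// exact template-argument order is inferred from the signature change in this
// patch and may differ (assumes the CK headers added above are in scope):
#include <memory>
#include <vector>

using F64         = double;
using F32         = float;
using F64_Tuple   = ck::Tuple<F64>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear    = ck::tensor_operation::element_wise::Bilinear;

void collect_f64_compute_f32_kknn_instances()
{
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceContractionMultipleD<
        2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>
        instances;

    // Fills `instances` with one op per entry of device_contraction_f64_kk_instance;
    // alpha/beta for Bilinear are bound later, when arguments are created.
    ck::tensor_operation::device::instance::
        add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
            instances);
}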
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..ae3ee856 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..b72005ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..b94030e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
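// All of the add_*_instance functions in these files delegate to
// add_device_operation_instances, which appends one heap-allocated op per
// tuple entry; a client then probes the collected ops at run time and keeps
// the first (or fastest) one that supports its problem. A self-contained toy
// of that probing loop; BaseOp and IsSupported are simplified stand-ins for
// CK's BaseOperator / IsSupportedArgument machinery:
#include <memory>
#include <vector>

struct BaseOp
{
    virtual ~BaseOp()                = default;
    virtual bool IsSupported() const = 0; // real CK: IsSupportedArgument(arg_ptr)
};

BaseOp* pick_first_supported(const std::vector<std::unique_ptr<BaseOp>>& instances)
{
    for(const auto& op : instances)
        if(op->IsSupported())
            return op.get();
    return nullptr; // no tuning entry fits this problem shape
}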
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
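// The four-letter layout suffix in these file names (kknn, knnn, mknn, mnnn)
// records which index varies fastest in A/B/D/E respectively; D and E are
// always n-fastest in the bilinear set, so only the A and B letters change.
// A hedged sketch of deriving the suffix from strides (the helper and its
// calling convention are hypothetical):
#include <array>
#include <string>

std::string layout_suffix(const std::array<long, 4>& a_strides, // [m0, m1, k0, k1]
                          const std::array<long, 4>& b_strides) // [n0, n1, k0, k1]
{
    // Unit stride on the trailing k index means that tensor is k-fastest.
    const char a = (a_strides[3] == 1) ? 'k' : 'm';
    const char b = (b_strides[3] == 1) ? 'k' : 'n';
    return std::string{a, b, 'n', 'n'}; // D and E contribute the fixed "nn"
}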
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
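// The FIXME that follows (repeated in every file of this patch) proposes
// replacing the translation-unit-wide CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
// macro with a per-op template parameter. A sketch of that direction; the
// parameter, the helper, and the clamping semantics shown here are
// hypothetical, not CK's current API:
template <bool UseOobCheckOffsetTrick>
struct BufferLoadSketch
{
    static float load(const float* p, long offset, long valid_span)
    {
        if constexpr(UseOobCheckOffsetTrick)
            // emulate a hardware buffer load that returns 0 when out of bounds
            return (offset >= 0 && offset < valid_span) ? p[offset] : 0.f;
        else
            return p[offset]; // caller must guarantee the offset is in bounds
    }
};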
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
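// One invariant is visible across all of these tuning tables: the block-transfer
// thread-cluster lengths S<a, b, c> multiply out to the BlockSize column
// (S<4, 64, 1>, S<8, 32, 1>, and S<16,16, 1> all give 256; S<4, 32, 1> and
// S<8, 16, 1> give 128; S<4, 16, 1> gives 64). A small compile-time check of
// that rule, with a local stand-in for ck::Sequence:
#include <cstddef>

template <std::size_t... Is>
struct S
{
    static constexpr std::size_t product = (Is * ... * std::size_t{1});
};

static_assert(S<4, 64, 1>::product == 256, "cluster must fill a 256-thread block");
static_assert(S<8, 32, 1>::product == 256, "cluster must fill a 256-thread block");
static_assert(S<4, 32, 1>::product == 128, "cluster must fill a 128-thread block");
static_assert(S<4, 16, 1>::product == 64, "cluster must fill a 64-thread block");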
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp new file mode 100644 index 00000000..5b70cc11 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -0,0 +1,707 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+#define HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+
+#include "../contraction_pack_util.hpp"
+#include "common.hpp"
+#include 
+
+namespace ck
+{
+ namespace tensor_operation
+ {
+ namespace device
+ {
+
+ using hiptensor::allocDevice;
+ using hiptensor::ceilDiv;
+ using hiptensor::DeviceDeleter;
+ using hiptensor::elementSpaceFromLengthsAndStrides;
+
+ using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+ using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex;
+ using Scale = ck::tensor_operation::element_wise::Scale;
+ using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex;
+
+ // The following is a specialization class for scale contractions of complex types.
+ // For complex types, the contraction can be decomposed into four simple contractions
+ // (two scale and two bilinear) on the underlying real element type.
+ // The class implements a CK interface that wraps the four individual contraction
+ // operations and handles their arguments internally.
+ // Note: we assume that the data arrives in Array of Structures (AOS) format, i.e. as
+ // complex pairs. The argument initialization decomposes this data into Structure of
+ // Arrays (SOA) form so that the real and imaginary components can be operated on
+ // separately.
+
+ // Tensor Contraction:
+ // input : A
+ // input : B
+ // input : D0, D1, ...
+ // output : E
+ // C = a_op(A) * b_op(B)
+ // E = cde_op(C, D0, D1, ...)
+ // Assume:
+ // A[M0, M1, M2, ..., K0, K1, K2, ...]
+ // B[N0, N1, N2, ..., K0, K1, K2, ...]
+ // D[M0, M1, M2, ..., N0, N1, N2, ...]
+ // E[M0, M1, M2, ..., N0, N1, N2, ...]
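+
+ // A sketch of the decomposition used by the Argument/Invoker below. With
+ // A = A_r + i*A_i and B = B_r + i*B_i, complex multiplication expands to
+ //   A * B = (A_r*B_r - A_i*B_i) + i*(A_r*B_i + A_i*B_r),
+ // which maps onto one Scale plus one Bilinear contraction per output plane
+ // (E_r / E_i denote the unpacked mE_real / mE_imag planes):
+ //   E_r =  A_r * B_r                    Scale    {1.0}
+ //   E_r = -1.0 * (A_i * B_i) + 1.0*E_r  Bilinear {-1.0, 1.0}
+ //   E_i =  A_r * B_i                    Scale    {1.0}
+ //   E_i =  A_i * B_r + 1.0*E_i          Bilinear {1.0, 1.0}
+ // The complex scale factor itself is applied afterwards, when the multiply
+ // kernel re-packs the E_r / E_i planes from SOA back to AOS.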
+ template + struct DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + HIP_vector_type, + HIP_vector_type, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + ScaleComplex, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + HIP_vector_type, + LoopSched> + + : public DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + ScaleComplex, + HIP_vector_type> + { + // Complex device Op + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + // CDE Operations + using ScaleCDEElementwiseOperation = ScaleComplex; + using DecompScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = BilinearComplex; + using DecompBilinearCDEElementwiseOperation = Bilinear; + + // Complex types given through the interface + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + using ComplexCompute = HIP_vector_type; + + // Internal functional types we will use to + // decompose the complex types and operate on. + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = EDataType; + using DecompE = EDataType; + using DecompCompute = ComputeDataType; + + // For complex types, we need to make sure that all of the types are the same + static_assert(std::is_same_v && std::is_same_v + && std::is_same_v + && std::is_same_v, + "Complex operations must have the same data type"); + + static_assert(std::is_same_v || std::is_same_v, + "Complex operations only supported with single or double precision"); + + static constexpr index_t NumDTensor = 0; + + // The internal operation that we will decompose the complex operations with. 
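+ // Per real plane, ScaleDecompOp computes E = alpha * contraction(A, B), while
+ // BilinearDecompOp computes E = alpha * contraction(A, B) + beta * D.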
+ // For complex will be either float or double + using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + DecompScaleCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + DecompCompute, + LoopSched>; + + // The internal operation that we will decompose the complex operations with. + // For complex will be either float or double + using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + DecompBilinearCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + DecompCompute, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + using ScaleDecompArgument = typename ScaleDecompOp::Argument; + using BilinearDecompArgument = typename BilinearDecompOp::Argument; + + Argument(Argument&& other) + : mScaleArgs( + {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])}) + , mBilinearArgs({std::move(other.mBilinearArgs[0]), + std::move(other.mBilinearArgs[1])}) + { + } + + Argument& operator=(Argument&& other) + { + if(this != &other) + { + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + } + return *this; + } + + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + 
const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) + : element_op(cde_element_op) + { + // Take the incoming arguments, treat them as complex. + + // Allocate Real and Imaginary inputs + auto elementsA + = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides); + auto elementsB + = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); + elementsE + = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + + mA_real.reset(nullptr); + mA_imag.reset(nullptr); + mB_real.reset(nullptr); + mB_imag.reset(nullptr); + mE_real.reset(nullptr); + mE_imag.reset(nullptr); + + mE_grid = p_e_grid; + auto blockDim = dim3(1024); + + auto decompGrid = [blockDim](auto& out_r, + auto& out_i, + auto const* input_grid, + uint32_t elementCount) { + using DecompT = typename std::decay_t::element_type; + static_assert(std::is_same_v< + DecompT, + typename std::decay_t::element_type>, + "r and i buffers must be same type"); + + if(input_grid != nullptr) + { + out_r = std::move(allocDevice(elementCount)); + out_i = std::move(allocDevice(elementCount)); + + auto gridDim = dim3(ceilDiv(elementCount, blockDim.x)); + hiptensor::unpack<<>>( + input_grid, out_r.get(), out_i.get(), elementCount); + } + }; + + // Decompose the incoming data from AOS->SOA + decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); + + auto allocScaleArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + auto allocBilinearArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& in_d, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{in_d.get()}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{e_ms_ns_lengths}, + std::array, 1>{e_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); + } + + void Print() const + { + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + 
mBilinearArgs[1]->Print(); + } + + // private: + // Each argument set for complex: + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; + + template + using DeviceArray = std::unique_ptr; + + // Manage extra memory for AOS->SOA + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mE_real; + DeviceArray mE_imag; + + ScaleCDEElementwiseOperation element_op; + void* mE_grid; + index_t elementsE; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = typename DeviceOp::Argument; + + Invoker() + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) + { + } + + Invoker(Invoker&& other) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , mBilinearInvoker(std::move(other.mBilinearInvoker)) + { + } + + Invoker& operator=(Invoker&& other) + { + if(this != &other) + { + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); + } + return *this; + } + + float Run(const Argument& arg, + const StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); + + if(arg.mE_grid != nullptr) + { + auto blockDim = dim3(1024); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + + hiptensor::multiply<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.scale_, + arg.elementsE); + } + + return r0 + r1 + r2 + r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. 
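+ // All four decomposed sub-arguments receive the same workspace pointer.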
+ this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..1da8301f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced 
Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..82c17500 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..1febb560 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..02b9d719 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp new file mode 100644 index 00000000..3133f4cd --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance + = device_contraction_kk_instance; + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck + diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp new file mode 100644 index 00000000..b358be8a --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp new file mode 100644 index 00000000..359a074a --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp new file mode 100644 index 00000000..4cc8659d --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp new file mode 100644 index 00000000..1cac8ebb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp new file mode 100644 index 00000000..e60bbd61 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp new file mode 100644 index 00000000..e44d24e1 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp new file mode 100644 index 00000000..dee9ce39 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance{}); + } + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..5917e466 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
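// The kkn/knn/mkn/mnn file suffixes encode which index is the unit-stride
// ("fast changing") axis for A, B and E respectively; D, when present,
// follows E. So "kkn" means k1 is contiguous in A[m0, m1, k0, k1] and
// B[n0, n1, k0, k1], while n1 is contiguous in E[m0, m1, n0, n1]. A minimal
// stride computation illustrating the convention (names are ours, not the
// library's):

#include <array>
#include <cstddef>

// Row-major strides: the last extent gets stride 1, i.e. it is the fast axis.
std::array<std::size_t, 4> rowMajorStrides(const std::array<std::size_t, 4>& lens)
{
    std::array<std::size_t, 4> strides{};
    std::size_t s = 1;
    for(int i = 3; i >= 0; --i)
    {
        strides[i] = s;
        s *= lens[i];
    }
    return strides;
}
// rowMajorStrides({m0, m1, k0, k1}) makes k1 unit-stride: the first "k" in "kkn".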
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..216f470e --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..3401b605 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..fe2fa97d --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
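// Each of these translation units repeats one registration pattern: a type
// alias names a tuple of concrete kernel configurations, and an
// add_..._instance() function default-constructs each element and appends it
// to a caller-owned vector of type-erased operations, which is essentially
// what add_device_operation_instances does. A stripped-down sketch of the
// pattern, with simplified stand-in types:

#include <memory>
#include <tuple>
#include <vector>

struct OpBase
{
    virtual ~OpBase() = default;
};
struct KernelConfigA : OpBase {};
struct KernelConfigB : OpBase {};

// The alias plays the role of device_contraction_..._instance.
using instance_tuple = std::tuple<KernelConfigA, KernelConfigB>;

// The add_* function appends one instance per tuple element.
void add_instances(std::vector<std::unique_ptr<OpBase>>& instances)
{
    std::apply(
        [&](auto... kernels) {
            (instances.push_back(std::make_unique<decltype(kernels)>(kernels)), ...);
        },
        instance_tuple{});
}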
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
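// The *_compute_bf16 and *_compute_f16 variants keep F32 storage for A, B and
// E but, judging by the file names and the extra template parameter on
// device_contraction_*_instance, carry out the inner arithmetic at reduced
// precision. The practical effect is fewer mantissa bits in flight; a quick
// self-contained illustration of what rounding an f32 value down to bf16
// costs (manual truncation here; the library has its own conversion types):

#include <cstdint>
#include <cstring>
#include <iostream>

float truncateToBf16(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0xFFFF0000u; // keep the sign, exponent and top 7 mantissa bits
    std::memcpy(&x, &bits, sizeof(bits));
    return x;
}

int main()
{
    std::cout << truncateToBf16(1.2345678f) << '\n'; // prints ~1.23438
}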
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp new file mode 100644 index 00000000..e4e4b7de --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp new file mode 100644 index 00000000..a641f6e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 88345e74..24d2d570 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 
1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( std::vector>>& instances) + Scale, + 
F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 38702afd..f559dc06 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| 
ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 735a5e34..a522052d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..dac46620 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance
+                    = device_contraction_f64_kk_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
new file mode 100644
index 00000000..0830b49f
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance
+                    = device_contraction_f64_kn_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
new file mode 100644
index 00000000..9a716ba3
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance
+                    = device_contraction_f64_mk_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
new file mode 100644
index 00000000..e02ac144
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance
+                    = device_contraction_f64_mn_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index f8904a8f..6f168ee2 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default +// setting. Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template <ck::index_t... Is> - using S = ck::Sequence<Is...>; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_element_wise_operation_complex.hpp b/library/src/contraction/device/device_element_wise_operation_complex.hpp new file mode 100644 index 00000000..a01ced36 --- /dev/null +++ b/library/src/contraction/device/device_element_wise_operation_complex.hpp @@ -0,0 +1,97 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP +#define HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP + +#include +#include +#include + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct ScaleComplex : public Scale +{ + __host__ __device__ ScaleComplex(hipDoubleComplex scale) : Scale(hipCreal(scale)) + { + scale_ = scale; + } + + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(hipFloatComplex& y, const hipFloatComplex& x) const + { + y = hipCmulf(hipComplexDoubleToFloat(scale_), x); + }; + + template <> + __host__ __device__ void operator()(hipDoubleComplex& y, const hipDoubleComplex& x) const + { + y = hipCmul(scale_, x); + }; + + // complex * float + hipDoubleComplex scale_; +}; + +struct BilinearComplex : public Bilinear +{ + BilinearComplex(hipDoubleComplex alpha, hipDoubleComplex beta) : Bilinear(hipCreal(alpha), hipCreal(beta)) + { + alpha_ = alpha; + beta_ = beta; + } + + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const; + + template <> + __host__ __device__ constexpr void + operator()(hipDoubleComplex& y, const hipDoubleComplex& x0, const hipDoubleComplex& x1) const + { + y = hipCadd(hipCmul(alpha_, x0), hipCmul(beta_, x1)); + }; + + template <> + __host__ __device__ constexpr void + operator()(hipFloatComplex& y, const hipFloatComplex& x0, const hipFloatComplex& x1) const + { + y = hipCaddf(hipCmulf(hipComplexDoubleToFloat(alpha_), x0), hipCmulf(hipComplexDoubleToFloat(beta_), x1)); + }; + + hipDoubleComplex alpha_; + hipDoubleComplex beta_; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp similarity index 73% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp index e8f73b58..81d7edf5 100644 --- a/library/src/contraction/device/contraction_bilinear.hpp +++ b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp @@ -37,120 +37,128 @@ namespace ck { namespace instance { + using F32 = float; + using CF32 = 
hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + using BilinearComplex = element_wise::BilinearComplex; - // float void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); // double void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); // Contraction + Bilinear template + typename DsDataType, + typename EDataType, + typename ComputeDataT> struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>> + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>; + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>; static auto GetInstances() { std::vector> op_ptrs; if constexpr(is_same_v && is_same_v - && is_same_v && is_same_v) + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && 
NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( op_ptrs); } } if constexpr(is_same_v && is_same_v - && is_same_v + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( op_ptrs); } } diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp similarity index 62% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale_instances.hpp index 916f79de..705ac6c0 100644 --- a/library/src/contraction/device/contraction_scale.hpp +++ b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp @@ -37,136 +37,161 @@ namespace ck { namespace instance { - - // float - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances); - - // double - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances); - + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + using F64 = double; + using CF64 = hipDoubleComplex; + + using ScaleComplex = element_wise::ScaleComplex; + + void + 
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( + std::vector>>& instances); + // Contraction + Scale template - struct HipTensorDeviceOperationInstanceFactory< + typename EDataType, + typename ComputeDataType> + struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>> + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>; + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>; static auto GetInstances() { @@ -177,13 +202,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( op_ptrs); } } @@ -193,13 +218,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + 
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( op_ptrs); } } diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..eb7d8919 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -147,7 +147,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* // Use a scale contraction due to // tensor C-descriptor is empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::SCALE, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? hiptensor::ContractionOpId_t::SCALE_COMPLEX + : hiptensor::ContractionOpId_t::SCALE; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, @@ -161,7 +165,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* { // Use a bilinear contraction due to // tensor C-descriptor is not empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::BILINEAR, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? hiptensor::ContractionOpId_t::BILINEAR_COMPLEX + : hiptensor::ContractionOpId_t::BILINEAR; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, *descC, *descD}, {alignmentRequirementA, @@ -242,17 +250,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. 
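// A minimal caller-side sketch of the complex routing introduced above, assuming the public
// hiptensorInitContractionDescriptor signature; the descriptor/mode names (a_ms_ks, b_ns_ks,
// d_ms_ns, modeA/B/D) are hypothetical stand-ins borrowed from the samples later in this
// patch. Passing a complex compute type is what selects the *_COMPLEX op ids:
//
//     hipFloatComplex alpha = make_hipFloatComplex(1.0f, 1.0f);
//     hiptensorContractionDescriptor_t desc;
//     CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(
//         handle, &desc,
//         &a_ms_ks, modeA.data(), alignmentRequirementA,
//         &b_ns_ks, modeB.data(), alignmentRequirementB,
//         nullptr, nullptr, 0, // empty C descriptor => scale contraction
//         &d_ms_ns, modeD.data(), alignmentRequirementD,
//         HIPTENSOR_COMPUTE_C32F)); // complex compute type => ContractionOpId_t::SCALE_COMPLEX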
if(solnQ.solutionCount() == 0) @@ -461,15 +458,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +498,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +517,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +582,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast<const float*>(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast<const double*>(alpha))); - } + auto alphaValue = hiptensor::readVal<hiptensor::ScalarData>( + alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%s", std::to_string(alphaValue).c_str()); } if(beta == nullptr) @@ -602,15 +593,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast<const float*>(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast<const double*>(beta))); - } + auto betaValue = hiptensor::readVal<hiptensor::ScalarData>( + beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%s", std::to_string(betaValue).c_str()); } } else @@ -708,17 +693,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, @@ -755,7 +729,17 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, 
true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -784,7 +768,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..5a31a91f 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -79,6 +79,14 @@ namespace hiptensor { return sizeof(uint64_t); } + else if(id == HIP_C_32F) + { + return sizeof(hipFloatComplex); + } + else if(id == HIP_C_64F) + { + return sizeof(hipDoubleComplex); + } else if(id == NONE_TYPE) { return 0; @@ -126,12 +134,124 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_32U; } + else if(hipType == HIP_C_32F) + { + return HIPTENSOR_COMPUTE_C32F; + } + else if(hipType == HIP_C_64F) + { + return HIPTENSOR_COMPUTE_C64F; + } else { return HIPTENSOR_COMPUTE_NONE; } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + return ScalarData(id, *(_Float16*)value); + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + return ScalarData(id, *(hip_bfloat16*)value); + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + return ScalarData(id, *(float*)value); + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + return ScalarData(id, *(double*)value); + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + return ScalarData(id, *(uint8_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + return ScalarData(id, *(int8_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + return ScalarData(id, *(uint32_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + return ScalarData(id, *(int32_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + auto complex = *(hipFloatComplex*)value; + return {id, complex.x, complex.y}; + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + auto complex = *(hipDoubleComplex*)value; + return {id, complex.x, complex.y}; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return {HIPTENSOR_COMPUTE_NONE, 0, 0}; + } + } + + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = (uint8_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = (int8_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = (uint32_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = (int32_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + *(hipFloatComplex*)addr = hipComplexDoubleToFloat(value.mComplex); + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + *(hipDoubleComplex*)addr = value.mComplex; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; 
+#endif // !NDEBUG + return; + } + } } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) @@ -144,11 +264,11 @@ bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) { return (computeType == HIPTENSOR_COMPUTE_16F); } - else if(hipType == HIP_R_32F) + else if(hipType == HIP_R_32F || hipType == HIP_C_32F) { return (computeType == HIPTENSOR_COMPUTE_32F); } - else if(hipType == HIP_R_64F) + else if(hipType == HIP_R_64F || hipType == HIP_C_64F) { return (computeType == HIPTENSOR_COMPUTE_64F); } @@ -207,3 +327,19 @@ bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType) { return !(computeType == hipType); } + +namespace std +{ + std::string to_string(const hiptensor::ScalarData& value) + { + if(value.mType == HIPTENSOR_COMPUTE_C32F || value.mType == HIPTENSOR_COMPUTE_C64F) + { + return string() + "[" + to_string(value.mComplex.x) + ", " + to_string(value.mComplex.y) + + "]"; + } + else + { + return to_string(value.mReal); + } + } +} diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..8d185758 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,9 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F) && (dataType != HIP_C_32F) + && (dataType != HIP_C_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..db9ff6c7 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -31,6 +31,7 @@ // Include order needs to be preserved #include #include +#include #include #include @@ -43,6 +44,46 @@ namespace hiptensor // Used to map to empty tensors struct NoneType; + struct ScalarData + { + hiptensorComputeType_t mType; + union + { + double mReal; + hipDoubleComplex mComplex; + }; + + ScalarData() = default; + ScalarData(hiptensorComputeType_t type, double real, double imag = 0) + { + mType = type; + if(type == HIPTENSOR_COMPUTE_C32F || type == HIPTENSOR_COMPUTE_C64F) + { + mComplex = make_hipDoubleComplex(real, imag); + } + else + { + mReal = real; + } + } + operator float() const + { + return static_cast(mReal); + } + operator double() const + { + return mReal; + } + operator hipFloatComplex() const + { + return hipComplexDoubleToFloat(mComplex); + } + operator hipDoubleComplex() const + { + return mComplex; + } + }; + static constexpr hipDataType NONE_TYPE = (hipDataType)31; // Map type to runtime HipDataType @@ -65,6 +106,7 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value); } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); @@ -73,6 +115,11 @@ bool operator==(hiptensorComputeType_t computeType, hipDataType hipType); bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType); bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType); +namespace std +{ + std::string to_string(const hiptensor::ScalarData& value); +} + #include "data_types_impl.hpp" #endif // HIPTENSOR_LIBRARY_DATA_TYPES_HPP diff --git a/library/src/include/data_types_impl.hpp 
b/library/src/include/data_types_impl.hpp index 7df6d7d9..c55f0d7e 100644 --- a/library/src/include/data_types_impl.hpp +++ b/library/src/include/data_types_impl.hpp @@ -105,6 +105,18 @@ namespace hiptensor static constexpr auto value = HIP_R_64U; }; + template <> + struct HipDataType + { + static constexpr auto value = HIP_C_32F; + }; + + template <> + struct HipDataType + { + static constexpr auto value = HIP_C_64F; + }; + template <> struct HipDataType { @@ -162,6 +174,14 @@ namespace hiptensor { return static_cast(*(uint64_t*)value); } + else if constexpr(std::is_same_v && id == HIP_C_32F) + { + return static_cast(*(hipFloatComplex*)value); + } + else if constexpr(std::is_same_v && id == HIP_C_64F) + { + return static_cast(*(hipDoubleComplex*)value); + } else { #if !NDEBUG @@ -215,6 +235,8 @@ namespace hiptensor } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id); } // namespace hiptensor #endif // HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP diff --git a/library/src/include/meta_traits.hpp b/library/src/include/meta_traits.hpp index 0e039cd6..2cd0d740 100644 --- a/library/src/include/meta_traits.hpp +++ b/library/src/include/meta_traits.hpp @@ -32,7 +32,7 @@ namespace hiptensor // Placeholder for building traits on any type T // Use partial or full specialization for any class. - template + template struct MetaTraits; } // namespace hiptensor diff --git a/library/src/include/xfloat32.hpp b/library/src/include/xfloat32.hpp deleted file mode 100644 index 6e9168cf..00000000 --- a/library/src/include/xfloat32.hpp +++ /dev/null @@ -1,334 +0,0 @@ -/* ************************************************************************ - * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- - * ies of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- - * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- - * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ************************************************************************ */ - -/*!\file - * \brief xfloat32.h provides struct for hiptensor_xfloat32 typedef - */ - -#ifndef HIPTENSOR_XFLOAT32_HPP -#define HIPTENSOR_XFLOAT32_HPP - -#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only -// include a minimal definition of hiptensor_xfloat32 - -#include -typedef struct -{ - float data; -} hiptensor_xfloat32; - -#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#include -#include -#include -#include -#include -#include - -#include "config.hpp" - -struct hiptensor_xfloat32 -{ - float data; - - enum round_t - { - round_up - }; - - HIPTENSOR_HOST_DEVICE hiptensor_xfloat32() = default; - - // round upper 19 bits of IEEE float to convert to xfloat32 - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f, round_t) - : data(float_to_xfloat32(f)) - { - } - - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f) - : data(truncate_float_to_xfloat32(f)) - { - } - - // zero extend lower 13 bits of xfloat32 to convert to IEEE float - HIPTENSOR_HOST_DEVICE operator float() const - { - return data; - } - - explicit HIPTENSOR_HOST_DEVICE operator bool() const - { - union - { - float fp32; - uint32_t int32; - } u = {data}; - return u.int32 & 0x7fffe000; - } - - explicit HIPTENSOR_HOST_DEVICE operator uint32_t() const - { - return uint32_t(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator long() const - { - return long(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator double() const - { - return double(float(*this)); - } - -private: - static HIPTENSOR_HOST_DEVICE float float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - if(~u.int32 & 0x7f800000) - { - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the xfloat32 mantissa up by adding 0xFFF, plus - // 1 if the least significant bit of the xfloat32 mantissa is 1 (odd). - // This causes the xfloat32's mantissa to be incremented by 1 if the 13 - // least significant bits of the float mantissa are greater than 0x1000, - // or if they are equal to 0x1000 and the least significant bit of the - // xfloat32 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 13 bits are exactly 0x1000. If the xfloat32 mantissa already - // has the value 0x3ff, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded xfloat32 value. When the xfloat32 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x3FF, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the xfloat32 value has an exponent of 0xFE and a mantissa of 0x3FF, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - - u.int32 += 0xfff + ((u.int32 >> 13) & 1); // Round to nearest, round to even - } - else if(u.int32 & 0x1fff) - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. 
If any of the - // lower 13 bits of the mantissa are 1, we set the least significant bit - // of the xfloat32 mantissa, in order to preserve signaling NaN in case - // the xfloat32's mantissa bits are all 0. - u.int32 |= 0x2000; // Preserve signaling NaN - } - - u.int32 &= 0xffffe000; - return u.fp32; - } - - // Truncate instead of rounding - static HIPTENSOR_HOST_DEVICE float truncate_float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - - u.int32 = u.int32 & 0xffffe000; - return u.fp32; - } -}; - -typedef struct -{ - float data; -} hiptensor_xfloat32_public; - -static_assert(std::is_standard_layout{}, - "hiptensor_xfloat32 is not a standard layout type, and thus is " - "incompatible with C."); - -static_assert(std::is_trivial{}, - "hiptensor_xfloat32 is not a trivial type, and thus is " - "incompatible with C."); - -static_assert(sizeof(hiptensor_xfloat32) == sizeof(hiptensor_xfloat32_public) - && offsetof(hiptensor_xfloat32, data) - == offsetof(hiptensor_xfloat32_public, data), - "internal hiptensor_xfloat32 does not match public hiptensor_xfloat32"); - -inline std::ostream& operator<<(std::ostream& os, const hiptensor_xfloat32& xf32) -{ - return os << float(xf32); -} - -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a) -{ - return a; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a) -{ - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - u.int32 ^= 0x80000000; - return hiptensor_xfloat32(u.fp32); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) + float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) - float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator*(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) * float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator/(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) / float(b)); -} -inline HIPTENSOR_HOST_DEVICE bool operator<(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) < float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator==(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) == float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return b < a; -} -inline HIPTENSOR_HOST_DEVICE bool operator<=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a > b); -} -inline HIPTENSOR_HOST_DEVICE bool operator!=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a == b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a < b); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator+=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a + b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator-=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a - b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator*=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a * b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator/=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a / b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator++(hiptensor_xfloat32& a) -{ - return a += hiptensor_xfloat32(1.0f); 
-} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator--(hiptensor_xfloat32& a) -{ - return a -= hiptensor_xfloat32(1.0f); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator++(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - ++a; - return orig; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator--(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - --a; - return orig; -} - -namespace std -{ - constexpr HIPTENSOR_HOST_DEVICE bool isinf(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && !(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool isnan(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && +(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool iszero(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return (u.fp32 == 0.0f); - } - - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 sin(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(sinf(float(a))); - } - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 cos(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(cosf(float(a))); - } - - HIPTENSOR_HOST_DEVICE constexpr hiptensor_xfloat32 real(const hiptensor_xfloat32& a) - { - return a; - } -} - -#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#endif // HIPTENSOR_XFLOAT32_HPP diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp index c1d4a3af..4820274f 100644 --- a/library/src/permutation/permutation_cpu_reference_impl.hpp +++ b/library/src/permutation/permutation_cpu_reference_impl.hpp @@ -92,7 +92,7 @@ namespace hiptensor auto bOffset = std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0); #endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR - B[bOffset] = static_cast<DataType>(A[elementIndex] * alphaValue); + B[bOffset] = static_cast<DataType>(A[elementIndex] * (DataType)alphaValue); } return HIPTENSOR_STATUS_SUCCESS; diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..d255c0e4 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,16 +26,65 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp) + 
add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_cf32 simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 
simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction.hpp similarity index 86% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction.hpp index 5704a59d..95c5d0f6 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -28,37 +28,21 @@ #include #include #include -#include #include #include #include #include "common.hpp" -int main(int argc, char* argv[]) +template +int bilinearContractionSample(void* alpha, void* beta) { - /*************************************** - * Check device support * - **************************************/ - if(!isF32Supported()) - { - std::cout << "unsupported host device" << std::endl; - exit(EXIT_FAILURE); - } - - typedef float ADataType; - typedef float BDataType; - typedef float CDataType; - typedef float floatTypeCompute; - - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - /********************** * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * *C_{m,n,u,v} @@ -74,12 +58,12 @@ int main(int argc, char* argv[]) std::unordered_map extent; - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; std::vector c_ms_ns_lengths; for(auto mode : modeC) @@ -166,19 +150,41 @@ int main(int argc, char* argv[]) /******************* * Initialize data *******************/ + int initMethod = 1; // TODO read value from commandline for(int64_t i = 0; i < elementsA; i++) { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + 
A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsB; i++) { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsC; i++) { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + C[i] = (CDataType)(float(i) / 100); + } } /******************************************** @@ -193,7 +199,6 @@ /************************************************ * Retrieve the memory alignment for each tensor ************************************************/ - uint32_t alignmentRequirementA; CHECK_HIPTENSOR_ERROR( hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); @@ -262,27 +267,13 @@ std::cout << "Launching contraction kernel..." << std::endl; - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); + CHECK_HIPTENSOR_ERROR(hiptensorContraction( + handle, &plan, alpha, A_d, B_d, beta, C_d, C_d, workspace, worksize, 0 /* stream */)); #if !NDEBUG bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -305,6 +296,15 @@ hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -318,6 +318,12 @@ hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp new file mode 100644 index 00000000..52915200 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+
+    floatTypeCompute alpha{1.0f};
+    floatTypeCompute beta{1.0f};
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>(&alpha, &beta);
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp
new file mode 100644
index 00000000..5b3bb7cc
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp
@@ -0,0 +1,58 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex CDataType; + typedef hipFloatComplex floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeC = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + + floatTypeCompute alpha{1.0f, 1.0f}; + floatTypeCompute beta{1.0f, 1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp new file mode 100644 index 00000000..8de0c534 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp new file mode 100644 index 00000000..6ce6d3c0 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp new file mode 100644 index 00000000..d4e28761 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp new file mode 100644 index 00000000..e493f1c3 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp new file mode 100644 index 00000000..0faffc3e --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp new file mode 100644 index 00000000..d5024eba --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF64Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double CDataType;
+    typedef double floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeC       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F;
+
+    floatTypeCompute alpha{1.0f};
+    floatTypeCompute beta{1.0f};
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>(&alpha, &beta);
+}
diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction.hpp
similarity index 91%
rename from samples/01_contraction/simple_scale_contraction_f32.cpp
rename to samples/01_contraction/simple_scale_contraction.hpp
index c76ec370..5db4598d 100644
--- a/samples/01_contraction/simple_scale_contraction_f32.cpp
+++ b/samples/01_contraction/simple_scale_contraction.hpp
@@ -34,29 +34,15 @@
 #include "common.hpp"

-int main(int argc, char* argv[])
+template <typename ADataType,
+          typename BDataType,
+          typename DDataType,
+          typename floatTypeCompute,
+          hipDataType typeA,
+          hipDataType typeB,
+          hipDataType typeD,
+          hiptensorComputeType_t typeCompute>
+int scaleContractionSample(void* alpha)
 {
-    /***************************************
-     * Check device support                *
-     **************************************/
-    if(!isF32Supported())
-    {
-        std::cout << "unsupported host device" << std::endl;
-        exit(EXIT_FAILURE);
-    }
-
-    typedef float ADataType;
-    typedef float BDataType;
-    typedef float DDataType;
-    typedef float floatTypeCompute;
-
-    hipDataType            typeA       = HIP_R_32F;
-    hipDataType            typeB       = HIP_R_32F;
-    hipDataType            typeD       = HIP_R_32F;
-    hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
-
-    floatTypeCompute alpha = (floatTypeCompute)1.0f;
-
     /**********************
     * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v}
     **********************/
@@ -71,12 +57,12 @@ int main(int argc, char* argv[])

     std::unordered_map<int32_t, int64_t> extent;

-    extent['m'] = 5;
-    extent['n'] = 6;
-    extent['u'] = 3;
-    extent['v'] = 4;
-    extent['h'] = 3;
-    extent['k'] = 4;
+    extent['m'] = 4;
+    extent['n'] = 3;
+    extent['u'] = 4;
+    extent['v'] = 3;
+    extent['h'] = 6;
+    extent['k'] = 5;

     std::vector<int64_t> d_ms_ns_lengths;
     for(auto mode : modeD)
@@ -163,14 +149,29 @@ int main(int argc, char* argv[])
     /*******************
      * Initialize data
      *******************/
+    int initMethod = 1; // TODO read the value from command line
     for(int64_t i = 0; i < elementsA; i++)
     {
-        A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100;
+        if(initMethod == 0)
+        {
+            A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            A[i] = (ADataType)(float(i) / 100);
+        }
     }

     for(int64_t i = 0; i < elementsB; i++)
     {
-        B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100;
+        if(initMethod == 0)
+        {
+            B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            B[i] = (BDataType)(float(i) / 100);
+        }
     }

     for(int64_t i = 0; i < elementsD; i++)
@@ -260,7 +261,7 @@ int main(int argc, char* argv[])

     CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
                                                &plan,
-                                               (void*)&alpha,
+                                               alpha,
                                                A_d,
                                                B_d,
                                                nullptr,
@@ -270,8 +271,6 @@ int main(int argc, char* argv[])
                                                worksize,
                                                0 /* stream */));

-    CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost));
-
 #if !NDEBUG
     bool printElements = false;
     bool storeElements = false;
diff --git a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp
new file mode 100644
index 00000000..5a991dbc
--- /dev/null
+++ b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_scale_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef hip_bfloat16 ADataType;
+    typedef hip_bfloat16 BDataType;
+    typedef hip_bfloat16 DDataType;
+    typedef float        floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_16BF;
+    constexpr hipDataType            typeB       = HIP_R_16BF;
+    constexpr hipDataType            typeD       = HIP_R_16BF;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
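+    // With HIPTENSOR_COMPUTE_32F the library reads alpha through readVal as a
+    // float, so the scalar below is declared with the compute type rather than
+    // the bf16 tensor data type.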
+
+    floatTypeCompute alpha = 1;
+    return scaleContractionSample<ADataType,
+                                  BDataType,
+                                  DDataType,
+                                  floatTypeCompute,
+                                  typeA,
+                                  typeB,
+                                  typeD,
+                                  typeCompute>(&alpha);
+}
diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp
new file mode 100644
index 00000000..a3eb5e6f
--- /dev/null
+++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex DDataType; + typedef hipFloatComplex floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeD = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + + floatTypeCompute alpha(1, 1); + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp new file mode 100644 index 00000000..9283283b --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp new file mode 100644 index 00000000..dac5e18b --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + floatTypeCompute alpha = floatTypeCompute{1.0f}; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp new file mode 100644 index 00000000..155f9585 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp new file mode 100644 index 00000000..2def291d --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp new file mode 100644 index 00000000..7b2a9c95 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp new file mode 100644 index 00000000..201741e9 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "simple_scale_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF64Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double DDataType;
+    typedef double floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeD       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F;
+
+    floatTypeCompute alpha = 1;
+    return scaleContractionSample<ADataType,
+                                  BDataType,
+                                  DDataType,
+                                  floatTypeCompute,
+                                  typeA,
+                                  typeB,
+                                  typeD,
+                                  typeCompute>(&alpha);
+}
diff --git a/samples/02_permutation/CMakeLists.txt b/samples/02_permutation/CMakeLists.txt
index 68857b54..ab66798c 100644
--- a/samples/02_permutation/CMakeLists.txt
+++ b/samples/02_permutation/CMakeLists.txt
@@ -26,7 +26,7 @@
 # Check whether building within hiptensor context
 if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
-    add_hiptensor_sample(permutation permutation.cpp)
+    add_hiptensor_sample(simple_permutation permutation.cpp)
 # If building hipTensor samples as a standalone Cmake project
 else()
     add_executable(permutation permutation.cpp)
diff --git a/test/00_unit/yaml_test.cpp b/test/00_unit/yaml_test.cpp
index 2efc6b6e..57a86a25 100644
--- a/test/00_unit/yaml_test.cpp
+++ b/test/00_unit/yaml_test.cpp
@@ -54,8 +54,8 @@ namespace hiptensor
     using LengthsT = std::vector<std::size_t>;
     using StridesT = std::vector<std::size_t>;
-    using AlphaT   = double;
-    using BetaT    = double;
+    using AlphaT   = std::vector<double>;
+    using BetaT    = std::vector<double>;

     //Data types of input and output tensors
     std::vector<std::vector<hipDataType>> mDataTypes;
@@ -79,9 +83,13 @@ int main(int argc, char* argv[])
     yee.mDataTypes = {
         // clang-format off
         {HIP_R_32F, HIP_R_32F, hiptensor::NONE_TYPE, HIP_R_32F, HIP_R_32F}, // scale F32
+        {HIP_C_32F, HIP_C_32F, hiptensor::NONE_TYPE, HIP_C_32F, HIP_C_32F}, // scale F32 Complex
         {HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F}, // bilinear F32
+        {HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F}, // bilinear F32 Complex
         {HIP_R_64F, HIP_R_64F, hiptensor::NONE_TYPE, HIP_R_64F, HIP_R_64F}, // scale F64
+        {HIP_C_64F, HIP_C_64F, hiptensor::NONE_TYPE, HIP_C_64F, HIP_C_64F}, // scale F64 Complex
         {HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F}, // bilinear F64
+        {HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F}, // bilinear F64 Complex
         // clang-format on
     };
     yee.mAlgorithms
@@ -94,8 +98,8 @@ int main(int argc, char* argv[])
     yee.mProblemLengths
         = {{5, 6, 7, 8, 4, 2, 3, 4}, {1, 2, 3, 4}, {99, 12, 44, 31, 59, 23, 54, 22}};
     yee.mProblemStrides = {{}};
-    yee.mAlphas = {0, 1, 1};
-    yee.mBetas  = {2, 2, 2};
+    yee.mAlphas = {{0}, {1}, {1}};
+    yee.mBetas  = {{2}, {2}, {2}};

     struct TmpFileWrapper
     {
diff --git a/test/01_contraction/CMakeLists.txt b/test/01_contraction/CMakeLists.txt
index fe2d7a87..a59eeefd 100644
--- a/test/01_contraction/CMakeLists.txt
+++ b/test/01_contraction/CMakeLists.txt
@@ -33,10 +33,20 @@ set (BilinearContractionTestSources ${ContractionCommonSources}
 set (BilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/bilinear_test_params.yaml)
 add_hiptensor_test(bilinear_contraction_test ${BilinearContractionTestConfig} ${BilinearContractionTestSources})

+# Complex Bilinear tests
+set (ComplexBilinearContractionTestSources ${ContractionCommonSources}
+     ${CMAKE_CURRENT_SOURCE_DIR}/complex_bilinear_contraction_test.cpp)
+set 
(ComplexBilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/complex_bilinear_test_params.yaml) +add_hiptensor_test(complex_bilinear_contraction_test ${ComplexBilinearContractionTestConfig} ${ComplexBilinearContractionTestSources}) + # Scale tests set (ScaleContractionTestSources ${ContractionCommonSources} ${CMAKE_CURRENT_SOURCE_DIR}/scale_contraction_test.cpp) set (ScaleContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/scale_test_params.yaml) add_hiptensor_test(scale_contraction_test ${ScaleContractionTestConfig} ${ScaleContractionTestSources}) - +# Complex Scale tests +set (ComplexScaleContractionTestSources ${ContractionCommonSources} + ${CMAKE_CURRENT_SOURCE_DIR}/complex_scale_contraction_test.cpp) +set (ComplexScaleContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/complex_scale_test_params.yaml) +add_hiptensor_test(complex_scale_contraction_test ${ComplexScaleContractionTestConfig} ${ComplexScaleContractionTestSources}) diff --git a/test/01_contraction/complex_bilinear_contraction_test.cpp b/test/01_contraction/complex_bilinear_contraction_test.cpp new file mode 100644 index 00000000..51e95c34 --- /dev/null +++ b/test/01_contraction/complex_bilinear_contraction_test.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexBilinearContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexBilinearContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexBilinearContractionTest, load_config_helper()); diff --git a/test/01_contraction/complex_scale_contraction_test.cpp b/test/01_contraction/complex_scale_contraction_test.cpp new file mode 100644 index 00000000..3995651b --- /dev/null +++ b/test/01_contraction/complex_scale_contraction_test.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexScaleContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexScaleContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexScaleContractionTest, load_config_helper()); diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..9306445a 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,8 +1,13 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -14,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml new file mode 100644 index 00000000..dfbb814e --- /dev/null +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -0,0 +1,30 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY 
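+# Note: complex Alphas/Betas entries below are [real, imaginary] pairs; the
+# test harness packs each pair into a ScalarData of the requested compute type.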
+Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - [0, 0] + - [1, 1] + - [1.1, 1.2] +Betas: + - [2, 2] + - [0, 0] + - [2.2, 2.3] +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 1 ] +Strides: + - [] +... diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml new file mode 100644 index 00000000..4bad2a9b --- /dev/null +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -0,0 +1,30 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_C_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY +Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - [0, 0] + - [1, 1] + - [1.1, 1.2] +Betas: + - [2, 2] + - [0, 0] + - [2.2, 2.3] +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 1 ] +Strides: + - [] +... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..4c52eeda 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,8 +1,13 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -14,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..664da2ec 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,8 +56,10 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) - || (isF64Supported() && datatype == HIP_R_64F); + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF + || datatype == HIP_C_32F)) + || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F)); } bool ContractionTest::checkSizes() const @@ -115,11 +117,23 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F) + || (ADataType == 
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 5d745d12..664da2ec 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -56,8 +56,10 @@ namespace hiptensor
     // False = skip test
     bool ContractionTest::checkDevice(hipDataType datatype) const
     {
-        return (isF32Supported() && datatype == HIP_R_32F)
-               || (isF64Supported() && datatype == HIP_R_64F);
+        return (isF32Supported()
+                && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF
+                    || datatype == HIP_C_32F))
+               || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F));
     }
 
     bool ContractionTest::checkSizes() const
@@ -115,11 +117,23 @@
         auto CDataType = testType[2];
         auto DDataType = testType[3];
 
-        EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F));
-        EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F));
-        EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F)
+        EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF)
+                    || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)
+                    || (ADataType == HIP_C_32F) || (ADataType == HIP_C_64F));
+        EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF)
+                    || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)
+                    || (BDataType == HIP_C_32F) || (BDataType == HIP_C_64F));
+        EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF)
+                    || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F)
+                    || (CDataType == HIP_C_32F) || (CDataType == HIP_C_64F)
                     || (CDataType == NONE_TYPE));
-        EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F));
+        EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF)
+                    || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)
+                    || (DDataType == HIP_C_32F) || (DDataType == HIP_C_64F));
+        EXPECT_TRUE(
+            (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF)
+            || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)
+            || (computeType == HIPTENSOR_COMPUTE_C32F) || (computeType == HIPTENSOR_COMPUTE_C64F));
 
         mRunFlag &= checkDevice(DDataType);
 
@@ -228,7 +242,35 @@
             auto resource = getResource();
             resource->resizeStorage(lengths, elementBytes);
 
-            if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F)
+            if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA);
+                fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB);
+                if(CDataType == HIP_R_16F)
+                {
+                    fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD);
+                }
+                fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(),
+                                              elementsCD,
+                                              std::numeric_limits<_Float16>::signaling_NaN());
+            }
+            else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceA().get(), elementsA);
+                fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceB().get(), elementsB);
+                if(CDataType == HIP_R_16BF)
+                {
+                    fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceC().get(),
+                                                   elementsCD);
+                }
+                fillValLaunchKernel<hip_bfloat16>(
+                    (hip_bfloat16*)resource->deviceD().get(),
+                    elementsCD,
+                    std::numeric_limits<hip_bfloat16>::signaling_NaN());
+            }
+            else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F)
             {
                 // Initialize matrix data on device
                 fillLaunchKernel<float>((float*)resource->deviceA().get(), elementsA);
@@ -254,6 +296,40 @@ namespace hiptensor
                                     elementsCD,
                                     std::numeric_limits<double>::signaling_NaN());
             }
+            else if(ADataType == HIP_C_32F && BDataType == HIP_C_32F && DDataType == HIP_C_32F)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceA().get(),
+                                                  elementsA);
+                fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceB().get(),
+                                                  elementsB);
+                if(CDataType == HIP_C_32F)
+                {
+                    fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceC().get(),
+                                                      elementsCD);
+                }
+                fillValLaunchKernel<hipFloatComplex>(
+                    (hipFloatComplex*)resource->deviceD().get(),
+                    elementsCD,
+                    std::numeric_limits<hipFloatComplex>::signaling_NaN());
+            }
+            else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceA().get(),
+                                                   elementsA);
+                fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceB().get(),
+                                                   elementsB);
+                if(CDataType == HIP_C_64F)
+                {
+                    fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceC().get(),
+                                                       elementsCD);
+                }
+                fillValLaunchKernel<hipDoubleComplex>(
+                    (hipDoubleComplex*)resource->deviceD().get(),
+                    elementsCD,
+                    std::numeric_limits<hipDoubleComplex>::signaling_NaN());
+            }
 
             resource->copyDeviceToHostAll(elementBytes);
 
@@ -328,7 +404,7 @@ namespace hiptensor
         {
             auto resource = getResource();
 
-            int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double));
+            int size = hipDataTypeSize(DDataType);
 
             size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(),
                                                a_ms_ks.mLengths.end(),
@@ -346,7 +422,50 @@
             auto D = resource->allocHost(elementsCD * size);
             resource->copyData(D, resource->deviceD(), elementsCD * size);
 
-            if(DDataType == HIP_R_32F)
+            if(DDataType == HIP_R_16F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<_Float16>(
+                    stream, (_Float16*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<_Float16>(
+                    stream, (_Float16*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<_Float16>(
+                    stream, (_Float16*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_R_16BF)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_R_32F)
             {
                 stream << "Tensor A elements:\n";
                 hiptensorPrintArrayElements<float>(
@@ -367,7 +486,7 @@
                 hiptensorPrintArrayElements<float>(stream, (float*)D.get(), elementsCD);
                 stream << std::endl;
             }
-            else
+            else if(DDataType == HIP_R_64F)
             {
                 stream << "Tensor A elements:\n";
                 hiptensorPrintArrayElements<double>(
@@ -388,6 +507,50 @@
                 hiptensorPrintArrayElements<double>(stream, (double*)D.get(), elementsCD);
                 stream << std::endl;
             }
+            else if(DDataType == HIP_C_32F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_C_64F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
         }
     }
 }
@@ -414,6 +577,19 @@ namespace hiptensor
 
             auto computeType = convertToComputeType(testType[4]);
 
+            /*
+             * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha:
+             * ```
+             * alphaF = hiptensor::readVal<float>(
+             *     alpha, convertToComputeType(HipDataType_v<ComputeDataT>));
+             * ```
+             * Hence, `alpha` and `beta` need to point to a value of the compute type.
+             */
+            ScalarData alphaBuf;
+            ScalarData betaBuf;
+            writeVal(&alphaBuf, computeType, ScalarData(computeType, alpha[0], alpha[1]));
+            writeVal(&betaBuf, computeType, ScalarData(computeType, beta[0], beta[1]));
+
             CHECK_HIPTENSOR_ERROR(
                 hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
 
@@ -421,20 +597,21 @@ namespace hiptensor
 
             CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
                                                        &plan,
-                                                       (void*)&alpha,
+                                                       (void*)&alphaBuf,
                                                        resource->deviceA().get(),
                                                        resource->deviceB().get(),
-                                                       (void*)&beta,
+                                                       (void*)&betaBuf,
                                                        resource->deviceC().get(),
                                                        resource->deviceD().get(),
                                                        workspace,
                                                        worksize,
                                                        0 /* stream */));
 
-            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha,
+            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan,
+                                                                (void*)&alphaBuf,
                                                                 resource->hostA().get(),
                                                                 resource->hostB().get(),
-                                                                (void*)&beta,
+                                                                (void*)&betaBuf,
                                                                 resource->hostC().get(),
                                                                 resource->hostD().get(),
                                                                 a_ms_ks.mLengths,
@@ -451,24 +628,47 @@ namespace hiptensor
                                                                 DDataType,
                                                                 workspace));
 
-            size_t elementsCD = std::accumulate(c_ms_ns.mLengths.begin(),
-                                                c_ms_ns.mLengths.end(),
+            size_t elementsCD = std::accumulate(d_ms_ns.mLengths.begin(),
+                                                d_ms_ns.mLengths.end(),
                                                 size_t{1},
                                                 std::multiplies<size_t>());
 
-            int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double));
+            int sizeD      = elementsCD * hipDataTypeSize(DDataType);
             auto reference = resource->allocDevice(sizeD);
             resource->copyData(reference, resource->hostD(), sizeD);
 
-            if(DDataType == HIP_R_32F)
+            if(DDataType == HIP_R_16F)
+            {
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(),
+                                                         (_Float16*)reference.get(),
+                                                         elementsCD,
+                                                         computeType);
+            }
+            else if(DDataType == HIP_R_16BF)
+            {
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<hip_bfloat16>(
+                        (hip_bfloat16*)resource->deviceD().get(),
+                        (hip_bfloat16*)reference.get(),
+                        elementsCD,
+                        computeType);
+            }
+            else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F)
             {
-                std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<float>(
-                    (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD);
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<float>((float*)resource->deviceD().get(),
+                                                      (float*)reference.get(),
+                                                      elementsCD,
+                                                      computeType);
             }
-            else if(DDataType == HIP_R_64F)
+            else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F)
             {
-                std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<double>(
-                    (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD);
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<double>((double*)resource->deviceD().get(),
+                                                       (double*)reference.get(),
+                                                       elementsCD,
+                                                       computeType);
             }
 
             EXPECT_TRUE(mValidationResult) << "Max relative error: " << mMaxRelativeError;
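The `ScalarData` staging above exists because `hiptensorContraction` receives `alpha` and `beta` as `void*` and reinterprets the pointed-to bytes according to the plan's compute type. A standalone sketch of that contract, with a hypothetical `loadScalar` standing in for the `hiptensor::readVal` call quoted in the comment:

```cpp
#include <hip/hip_complex.h>

#include <cstring>
#include <iostream>

// Hypothetical readVal-style loader: the void* scalar must point to a value
// of the compute type, or the reinterpretation below reads the wrong bytes.
template <typename ComputeT>
ComputeT loadScalar(void const* scalar)
{
    ComputeT value;
    std::memcpy(&value, scalar, sizeof(ComputeT));
    return value;
}

int main()
{
    hipFloatComplex alpha = make_hipFloatComplex(1.1f, 1.2f);
    auto            read  = loadScalar<hipFloatComplex>(&alpha);
    std::cout << read.x << " + " << read.y << "i\n"; // 1.1 + 1.2i

    // Passing &someDouble for a C32F compute type would mis-read the bytes,
    // which is why the tests pack values into a compute-type buffer first.
    return 0;
}
```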
diff --git a/test/01_contraction/contraction_test_params.hpp b/test/01_contraction/contraction_test_params.hpp
index 29c4aa1b..4db4ebc1 100644
--- a/test/01_contraction/contraction_test_params.hpp
+++ b/test/01_contraction/contraction_test_params.hpp
@@ -49,8 +49,8 @@ namespace hiptensor
         using LengthsT = std::vector<std::size_t>;
         using StridesT = std::vector<std::size_t>;
-        using AlphaT   = double;
-        using BetaT    = double;
+        using AlphaT   = std::vector<double>;
+        using BetaT    = std::vector<double>;
 
     public:
         std::vector<TestDataTypeT>& dataTypes()
diff --git a/test/02_permutation/CMakeLists.txt b/test/02_permutation/CMakeLists.txt
index 4334901c..bb2796ea 100644
--- a/test/02_permutation/CMakeLists.txt
+++ b/test/02_permutation/CMakeLists.txt
@@ -29,7 +29,10 @@ set(PermutationCommonSources ${HIPTENSOR_COMMON_TEST_SOURCES}
 # tests
 set (PermutationTestSources ${PermutationCommonSources}
-     ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp)
+     ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/permutation_cpu_impl_test.cpp
+     )
+
 set (PermutationTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/test_params.yaml)
 add_hiptensor_test(permutation_test ${PermutationTestConfig} ${PermutationTestSources})
diff --git a/test/02_permutation/permutation_cpu_impl_test.cpp b/test/02_permutation/permutation_cpu_impl_test.cpp
new file mode 100644
index 00000000..5a885f0b
--- /dev/null
+++ b/test/02_permutation/permutation_cpu_impl_test.cpp
@@ -0,0 +1,163 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <numeric>
+
+#include "data_types.hpp"
+#include "logger.hpp"
+#include "permutation/permutation_cpu_reference.hpp"
+#include "permutation_test.hpp"
+#include "utils.hpp"
+#include "llvm/hiptensor_options.hpp"
+
+template <typename floatTypeA, typename floatTypeB, typename floatTypeCompute>
+auto permuteWithCpu(hipDataType typeA, hipDataType typeB, hipDataType typeCompute)
+{
+    std::vector<int32_t> modeA{'w', 'h', 'c', 'n'};
+    std::vector<int32_t> modeB{'c', 'n', 'h', 'w'};
+    int                  nmodeA = modeA.size();
+    int                  nmodeB = modeB.size();
+
+    std::unordered_map<int32_t, int64_t> extent;
+    extent['h'] = 2;
+    extent['w'] = 3;
+    extent['c'] = 4;
+    extent['n'] = 5;
+
+    std::vector<int64_t> extentA;
+    for(auto mode : modeA)
+    {
+        extentA.push_back(extent[mode]);
+    }
+    std::vector<int64_t> extentB;
+    for(auto mode : modeB)
+    {
+        extentB.push_back(extent[mode]);
+    }
+
+    /**********************
+     * Allocating data
+     **********************/
+
+    size_t elementsA = 1;
+    for(auto mode : modeA)
+    {
+        elementsA *= extent[mode];
+    }
+    size_t elementsB = 1;
+    for(auto mode : modeB)
+    {
+        elementsB *= extent[mode];
+    }
+
+    size_t sizeA = sizeof(floatTypeA) * elementsA;
+    size_t sizeB = sizeof(floatTypeB) * elementsB;
+
+    std::vector<floatTypeA> aArray(elementsA);
+    std::vector<floatTypeB> bArray(elementsB);
+    std::iota(aArray.begin(), aArray.end(), 0);
+
+#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+    std::vector<floatTypeB> referenceArray
+        = {0.,    12.6,  25.2,  37.8,  50.4,  63.,   75.6,  88.2,  100.8, 113.4, 126.,  138.6,
+           151.2, 163.8, 176.4, 189.,  201.6, 214.2, 226.8, 239.4, 6.3,   18.9,  31.5,  44.1,
+           56.7,  69.3,  81.9,  94.5,  107.1, 119.7, 132.3, 144.9, 157.5, 170.1, 182.7, 195.3,
+           207.9, 220.5, 233.1, 245.7, 2.1,   14.7,  27.3,  39.9,  52.5,  65.1,  77.7,  90.3,
+           102.9, 115.5, 128.1, 140.7, 153.3, 165.9, 178.5, 191.1, 203.7, 216.3, 228.9, 241.5,
+           8.4,   21.,   33.6,  46.2,  58.8,  71.4,  84.,   96.6,  109.2, 121.8, 134.4, 147.,
+           159.6, 172.2, 184.8, 197.4, 210.,  222.6, 235.2, 247.8, 4.2,   16.8,  29.4,  42.,
+           54.6,  67.2,  79.8,  92.4,  105.,  117.6, 130.2, 142.8, 155.4, 168.,  180.6, 193.2,
+           205.8, 218.4, 231.,  243.6, 10.5,  23.1,  35.7,  48.3,  60.9,  73.5,  86.1,  98.7,
+           111.3, 123.9, 136.5, 149.1, 161.7, 174.3, 186.9, 199.5, 212.1, 224.7, 237.3, 249.9};
+#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+    std::vector<floatTypeB> referenceArray
+        = {0.,    84.,   168.,  42.,   126.,  210.,  2.1,   86.1,  170.1, 44.1,  128.1, 212.1,
+           4.2,   88.2,  172.2, 46.2,  130.2, 214.2, 6.3,   90.3,  174.3, 48.3,  132.3, 216.3,
+           8.4,   92.4,  176.4, 50.4,  134.4, 218.4, 10.5,  94.5,  178.5, 52.5,  136.5, 220.5,
+           12.6,  96.6,  180.6, 54.6,  138.6, 222.6, 14.7,  98.7,  182.7, 56.7,  140.7, 224.7,
+           16.8,  100.8, 184.8, 58.8,  142.8, 226.8, 18.9,  102.9, 186.9, 60.9,  144.9, 228.9,
+           21.,   105.,  189.,  63.,   147.,  231.,  23.1,  107.1, 191.1, 65.1,  149.1, 233.1,
+           25.2,  109.2, 193.2, 67.2,  151.2, 235.2, 27.3,  111.3, 195.3, 69.3,  153.3, 237.3,
+           29.4,  113.4, 197.4, 71.4,  155.4, 239.4, 31.5,  115.5, 199.5, 73.5,  157.5, 241.5,
+           33.6,  117.6, 201.6, 75.6,  159.6, 243.6, 35.7,  119.7, 203.7, 77.7,  161.7, 245.7,
+           37.8,  121.8, 205.8, 79.8,  163.8, 247.8, 39.9,  123.9, 207.9, 81.9,  165.9, 249.9};
+
+#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+
+    const floatTypeCompute alphaValue = 2.1f;
+    hiptensorHandle_t*     handle;
+    CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+    hiptensorTensorDescriptor_t descA;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
+        handle, &descA, nmodeA, extentA.data(), NULL /* stride */, typeA, HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t descB;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
+        handle, &descB, nmodeB, extentB.data(), NULL /* stride */, typeB, HIPTENSOR_OP_IDENTITY));
+
+    hiptensor::detail::permuteByCpu(&alphaValue,
+                                    aArray.data(),
+                                    &descA,
+                                    modeA.data(),
+                                    bArray.data(),
+                                    &descB,
+                                    modeB.data(),
+                                    typeCompute);
+    return compareEqual(referenceArray.data(),
+                        bArray.data(),
+                        bArray.size(),
+                        hiptensor::convertToComputeType(typeCompute),
+                        10);
+}
+
+TEST(PermutationCpuImplTest, CompareF32ResultWithReference)
+{
+    typedef float floatTypeA;
+    typedef float floatTypeB;
+    typedef float floatTypeCompute;
+
+    hipDataType typeA       = HIP_R_32F;
+    hipDataType typeB       = HIP_R_32F;
+    hipDataType typeCompute = HIP_R_32F;
+
+    auto [result, maxRelativeError]
+        = permuteWithCpu<floatTypeA, floatTypeB, floatTypeCompute>(typeA, typeB, typeCompute);
+    EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError;
+}
+
+TEST(PermutationCpuImplTest, CompareF16ResultWithReference)
+{
+    typedef _Float16 floatTypeA;
+    typedef _Float16 floatTypeB;
+    typedef _Float16 floatTypeCompute;
+
+    hipDataType typeA       = HIP_R_16F;
+    hipDataType typeB       = HIP_R_16F;
+    hipDataType typeCompute = HIP_R_16F;
+
+    auto [result, maxRelativeError]
+        = permuteWithCpu<floatTypeA, floatTypeB, floatTypeCompute>(typeA, typeB, typeCompute);
+    EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError;
+}
diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp
index 1f448ff8..6acd7577 100644
--- a/test/02_permutation/permutation_resource.cpp
+++ b/test/02_permutation/permutation_resource.cpp
@@ -72,7 +72,7 @@ namespace hiptensor
             mCurrentAllocByte = requiredMemorySize;
             needFillData      = true;
         }
-        else if(mCurrentDataType != dataType)
+        if(mCurrentDataType != dataType || mCurrentMatrixElement < requiredElementCount)
         {
             needFillData = true;
         }
diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp
index cfadf5c0..078c78a4 100644
--- a/test/02_permutation/permutation_test.cpp
+++ b/test/02_permutation/permutation_test.cpp
@@ -257,7 +257,8 @@ namespace hiptensor
                 std::tie(mValidationResult, mMaxRelativeError)
                     = compareEqualLaunchKernel<float>((float*)resource->deviceB().get(),
                                                       (float*)resource->deviceReference().get(),
-                                                      resource->getCurrentMatrixElement());
+                                                      resource->getCurrentMatrixElement(),
+                                                      convertToComputeType(computeDataType));
             }
             else if(abDataType == HIP_R_16F)
             {
@@ -273,7 +274,8 @@ namespace hiptensor
                 std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>(
                     (_Float16*)resource->deviceB().get(),
                     (_Float16*)resource->deviceReference().get(),
-                    resource->getCurrentMatrixElement());
+                    resource->getCurrentMatrixElement(),
+                    convertToComputeType(computeDataType));
             }
         }
diff --git a/test/device/common.hpp b/test/device/common.hpp
index f961abc1..283a9035 100644
--- a/test/device/common.hpp
+++ b/test/device/common.hpp
@@ -72,8 +72,21 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed)
 
     if(index < elementSize)
     {
-        auto value  = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize;
-        data[index] = static_cast<DataType>(value);
+        if constexpr(std::is_same_v<DataType, hipFloatComplex>)
+        {
+            auto value  = (float(index / float(RAND_MAX) - 0.5) * 100) / elementSize;
+            data[index] = make_hipFloatComplex(value, value);
+        }
+        else if constexpr(std::is_same_v<DataType, hipDoubleComplex>)
+        {
+            auto value  = (double(index / double(RAND_MAX) - 0.5) * 100) / elementSize;
+            data[index] = make_hipDoubleComplex(value, value);
+        }
+        else
+        {
+            auto value  = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize;
+            data[index] = static_cast<DataType>(value);
+        }
     }
 }
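The `if constexpr` branches in `fillKernel` are needed because `hipFloatComplex` and `hipDoubleComplex` do not convert implicitly from arithmetic values, so the generic `static_cast` path cannot be instantiated for them. A host-side sketch of the same compile-time dispatch pattern (generic names, not hipTensor code):

```cpp
#include <hip/hip_complex.h>

#include <type_traits>

// Compile-time dispatch: only the branch matching DataType is instantiated,
// so the real-typed static_cast never has to compile for complex types.
template <typename DataType>
DataType makeFillValue(double value)
{
    if constexpr(std::is_same_v<DataType, hipFloatComplex>)
    {
        return make_hipFloatComplex(static_cast<float>(value), static_cast<float>(value));
    }
    else if constexpr(std::is_same_v<DataType, hipDoubleComplex>)
    {
        return make_hipDoubleComplex(value, value);
    }
    else
    {
        return static_cast<DataType>(value);
    }
}

int main()
{
    auto c = makeFillValue<hipFloatComplex>(0.5); // (0.5, 0.5)
    auto f = makeFillValue<float>(0.5);           // 0.5f
    return (c.x == f) ? 0 : 1;
}
```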
diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp
index 46f4c43e..8b504b01 100644
--- a/test/llvm/yaml_parser_config.cpp
+++ b/test/llvm/yaml_parser_config.cpp
@@ -92,6 +92,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorOperator_t)
 LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorWorksizePreference_t)
 LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<std::size_t>)
 LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<hipDataType>)
+LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<double>)
 LLVM_YAML_IS_SEQUENCE_VECTOR(AlphaT)
 LLVM_YAML_IS_SEQUENCE_VECTOR(BetaT)
 
@@ -110,8 +111,11 @@ namespace llvm
         static void enumeration(IO& io, hipDataType& value)
         {
             io.enumCase(value, "HIP_R_16F", HIP_R_16F);
+            io.enumCase(value, "HIP_R_16BF", HIP_R_16BF);
             io.enumCase(value, "HIP_R_32F", HIP_R_32F);
             io.enumCase(value, "HIP_R_64F", HIP_R_64F);
+            io.enumCase(value, "HIP_C_32F", HIP_C_32F);
+            io.enumCase(value, "HIP_C_64F", HIP_C_64F);
             io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE);
         }
     };
@@ -226,10 +230,10 @@ namespace llvm
             io.mapRequired("Algorithm Types", doc.algorithms());
             io.mapRequired("Operators", doc.operators());
             io.mapRequired("Worksize Prefs", doc.workSizePrefrences());
-            io.mapRequired("Alphas", (std::vector<AlphaT>&)(doc.alphas()));
+            io.mapOptional("Alphas", (std::vector<std::vector<double>>&)(doc.alphas()));
             io.mapOptional("Betas",
-                           (std::vector<BetaT>&)(doc.betas()),
-                           std::vector<BetaT>(doc.alphas().size(), BetaT(0)));
+                           (std::vector<std::vector<double>>&)(doc.betas()),
+                           std::vector<std::vector<double>>(doc.alphas().size()));
             io.mapRequired("Lengths", doc.problemLengths());
 
             // Default values for optional values
@@ -256,6 +260,13 @@ namespace llvm
                 return "Error: Empty Alphas";
             }
 
+            if(std::any_of(doc.alphas().cbegin(), doc.alphas().cend(), [](auto&& alpha) {
+                   return alpha.size() > 2 || alpha.size() <= 0;
+               }))
+            {
+                return "Error: invalid Alpha";
+            }
+
             if(doc.betas().size() > 0 && doc.betas().size() != doc.alphas().size())
             {
                 return "Error: Alphas and betas must have same size";
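The `std::any_of` guard added above enforces that every scalar entry carries one component (real) or two (real plus imaginary). The same check, extracted so it can be exercised outside the LLVM YAML machinery; note also that the `mapOptional` default for Betas builds one empty inner vector per alpha entry:

```cpp
#include <algorithm>
#include <vector>

int main()
{
    std::vector<std::vector<double>> alphas = {{1}, {1.1, 1.2}, {}};

    // Same shape as the validate() rule above: every entry must hold
    // one component (real) or two components (real, imaginary).
    bool invalid = std::any_of(alphas.cbegin(), alphas.cend(), [](auto const& alpha) {
        return alpha.size() > 2 || alpha.size() <= 0;
    });

    return invalid ? 1 : 0; // the empty third entry trips the check
}
```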
diff --git a/test/utils.hpp b/test/utils.hpp
index 1f7ece44..fc999738 100644
--- a/test/utils.hpp
+++ b/test/utils.hpp
@@ -41,9 +41,9 @@
 #include
 #include
 #include
+#include
 
 #include "device/common.hpp"
-#include "types.hpp"
 
 #define HIPTENSOR_FREE_DEVICE(ptr)     \
     if(ptr != nullptr)                 \
     {                                  \
@@ -57,6 +57,59 @@
         CHECK_HIP_ERROR(hipHostFree(ptr)); \
     }
 
+inline double getEpsilon(hiptensorComputeType_t id)
+{
+    auto toDouble = [](auto const& val) { return static_cast<double>(static_cast<float>(val)); };
+
+    if(id == HIPTENSOR_COMPUTE_16F)
+    {
+        return toDouble(std::numeric_limits<_Float16>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_16BF)
+    {
+        return toDouble(std::numeric_limits<hip_bfloat16>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_32F)
+    {
+        return toDouble(std::numeric_limits<float>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_64F)
+    {
+        return toDouble(std::numeric_limits<double>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_8U)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_8I)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_32U)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_32I)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_C32F)
+    {
+        return toDouble(std::numeric_limits<float>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_C64F)
+    {
+        return toDouble(std::numeric_limits<double>::epsilon());
+    }
+    else
+    {
+#if !NDEBUG
+        std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl;
+#endif // !NDEBUG
+        return 0;
+    }
+}
+
 inline bool isF32Supported()
 {
     hipDevice_t mHandle;
@@ -137,10 +190,11 @@ __host__ static inline void
 }
 
 template <typename DDataType>
-std::pair<bool, double> compareEqual(DDataType const* deviceD,
-                                     DDataType const* hostD,
-                                     std::size_t      elementsD,
-                                     double           tolerance = 100.0)
+std::pair<bool, double> compareEqual(DDataType const*       deviceD,
+                                     DDataType const*       hostD,
+                                     std::size_t            elementsD,
+                                     hiptensorComputeType_t computeType,
+                                     double                 tolerance = 100.0)
 {
     bool   retval             = true;
     double max_relative_error = 0.0;
@@ -191,7 +245,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
         }
     }
 
-    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
+    auto eps = getEpsilon(computeType);
     if(isInf)
     {
         retval = false;
@@ -211,10 +265,11 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
 }
 
 template <typename DDataType>
-std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
-                                                 DDataType*  hostD,
-                                                 std::size_t elementsD,
-                                                 double      tolerance = 100.0)
+std::pair<bool, double> compareEqualLaunchKernel(DDataType*             deviceD,
+                                                 DDataType*             hostD,
+                                                 std::size_t            elementsD,
+                                                 hiptensorComputeType_t computeType,
+                                                 double                 tolerance = 100.0)
 {
     auto blockDim = dim3(1024, 1, 1);
     auto gridDim  = dim3(ceilDiv(elementsD, blockDim.x), 1, 1);
@@ -276,7 +331,7 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD,
     auto toDouble
         = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); };
 
-    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
+    auto eps = getEpsilon(computeType);
     if(isNaN)
     {
         retval = false;
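One consequence of routing `getEpsilon(computeType)` into the comparators: the pass/fail bound now tracks the precision of the compute pipeline rather than the output storage type. Assuming the bound is `tolerance * eps`, matching the pre-existing `tolerance = 100.0` parameter (the comparison itself sits outside this hunk), the effect looks like this:

```cpp
#include <iostream>
#include <limits>

int main()
{
    // Assumption: results are accepted when maxRelativeError < tolerance * eps,
    // with eps taken from the compute type (see getEpsilon above).
    double epsF16    = 0.0009765625; // _Float16 epsilon, 2^-10
    double epsF32    = std::numeric_limits<float>::epsilon();
    double tolerance = 100.0;

    double maxRelativeError = 5e-4; // plausible error for f16 accumulation

    std::cout << (maxRelativeError < tolerance * epsF16) << '\n'; // 1: passes at f16 precision
    std::cout << (maxRelativeError < tolerance * epsF32) << '\n'; // 0: would fail at f32 precision
    return 0;
}
```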