From 1f69024067217522fb86084e363059112a60c9e3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 Nov 2023 18:13:55 +0000 Subject: [PATCH 01/42] Bump rocm-docs-core from 0.28.0 to 0.30.0 in /docs/.sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.28.0 to 0.30.0. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.28.0...v0.30.0) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 94103e1a..22885896 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.28.0 +rocm-docs-core==0.30.0 # via -r requirements.in smmap==5.0.0 # via gitdb From fc450d9a612feb91817b8b999c03fa1044695aa7 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 30 Nov 2023 23:05:50 +0000 Subject: [PATCH 02/42] Rename sample of permutation Rename `permutation` to `simple_permutation` to comply with naming conventions. --- samples/02_permutation/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/02_permutation/CMakeLists.txt b/samples/02_permutation/CMakeLists.txt index 68857b54..ab66798c 100644 --- a/samples/02_permutation/CMakeLists.txt +++ b/samples/02_permutation/CMakeLists.txt @@ -26,7 +26,7 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(permutation permutation.cpp) + add_hiptensor_sample(simple_permutation permutation.cpp) # If building hipTensor samples as a standalone Cmake project else() add_executable(permutation permutation.cpp) From 9c4c7622a05dfbf8adae976e9be2b494372c439b Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 20 Nov 2023 22:24:28 +0000 Subject: [PATCH 03/42] Add unit test for the permute CPU implementation - compare the permute result with reference - test col/row major - test float and _Float16 --- test/02_permutation/CMakeLists.txt | 5 +- .../permutation_cpu_impl_test.cpp | 159 ++++++++++++++++++ 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 test/02_permutation/permutation_cpu_impl_test.cpp diff --git a/test/02_permutation/CMakeLists.txt b/test/02_permutation/CMakeLists.txt index 4334901c..bb2796ea 100644 --- a/test/02_permutation/CMakeLists.txt +++ b/test/02_permutation/CMakeLists.txt @@ -29,7 +29,10 @@ set(PermutationCommonSources ${HIPTENSOR_COMMON_TEST_SOURCES} # tests set (PermutationTestSources ${PermutationCommonSources} - ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/permutation_cpu_impl_test.cpp + ) + set (PermutationTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/test_params.yaml) add_hiptensor_test(permutation_test ${PermutationTestConfig} ${PermutationTestSources}) diff --git a/test/02_permutation/permutation_cpu_impl_test.cpp b/test/02_permutation/permutation_cpu_impl_test.cpp new file mode 100644 index 00000000..014dbc61 --- /dev/null +++
b/test/02_permutation/permutation_cpu_impl_test.cpp @@ -0,0 +1,159 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include + +#include "data_types.hpp" +#include "logger.hpp" +#include "permutation/permutation_cpu_reference.hpp" +#include "permutation_test.hpp" +#include "utils.hpp" +#include "llvm/hiptensor_options.hpp" + +template +auto permuteWithCpu(hipDataType typeA, hipDataType typeB, hipDataType typeCompute) +{ + std::vector modeA{'w', 'h', 'c', 'n'}; + std::vector modeB{'c', 'n', 'h', 'w'}; + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + + std::unordered_map extent; + extent['h'] = 2; + extent['w'] = 3; + extent['c'] = 4; + extent['n'] = 5; + + std::vector extentA; + for(auto mode : modeA) + { + extentA.push_back(extent[mode]); + } + std::vector extentB; + for(auto mode : modeB) + { + extentB.push_back(extent[mode]); + } + + /********************** + * Allocating data + **********************/ + + size_t elementsA = 1; + for(auto mode : modeA) + { + elementsA *= extent[mode]; + } + size_t elementsB = 1; + for(auto mode : modeB) + { + elementsB *= extent[mode]; + } + + size_t sizeA = sizeof(floatTypeA) * elementsA; + size_t sizeB = sizeof(floatTypeB) * elementsB; + + std::vector aArray(elementsA); + std::vector bArray(elementsB); + std::iota(aArray.begin(), aArray.end(), 0); + +#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR + std::vector referenceArray + = {0., 12.6, 25.2, 37.8, 50.4, 63., 75.6, 88.2, 100.8, 113.4, 126., 138.6, + 151.2, 163.8, 176.4, 189., 201.6, 214.2, 226.8, 239.4, 6.3, 18.9, 31.5, 44.1, + 56.7, 69.3, 81.9, 94.5, 107.1, 119.7, 132.3, 144.9, 157.5, 170.1, 182.7, 195.3, + 207.9, 220.5, 233.1, 245.7, 2.1, 14.7, 27.3, 39.9, 52.5, 65.1, 77.7, 90.3, + 102.9, 115.5, 128.1, 140.7, 153.3, 165.9, 178.5, 191.1, 203.7, 216.3, 228.9, 241.5, + 8.4, 21., 33.6, 46.2, 58.8, 71.4, 84., 96.6, 109.2, 121.8, 134.4, 147., + 159.6, 172.2, 184.8, 197.4, 210., 222.6, 235.2, 247.8, 4.2, 16.8, 29.4, 42., + 54.6, 67.2, 79.8, 92.4, 105., 117.6, 130.2, 142.8, 155.4, 168., 180.6, 193.2, + 205.8, 218.4, 231., 243.6, 10.5, 23.1, 35.7, 48.3, 60.9, 73.5, 86.1, 98.7, + 111.3, 123.9, 136.5, 149.1, 161.7, 174.3, 186.9, 199.5, 212.1, 224.7, 237.3, 249.9}; +#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + 
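+    // How the expected values are derived (both layouts): aArray is filled
+    // with std::iota (0, 1, 2, ...) and scaled by alphaValue (2.1, declared
+    // below), so each reference entry is 2.1 times the index of the input
+    // element that the (w, h, c, n) -> (c, n, h, w) permutation gathers into
+    // that output slot (e.g. 84. == 2.1 * 40).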
std::vector referenceArray + = {0., 84., 168., 42., 126., 210., 2.1, 86.1, 170.1, 44.1, 128.1, 212.1, + 4.2, 88.2, 172.2, 46.2, 130.2, 214.2, 6.3, 90.3, 174.3, 48.3, 132.3, 216.3, + 8.4, 92.4, 176.4, 50.4, 134.4, 218.4, 10.5, 94.5, 178.5, 52.5, 136.5, 220.5, + 12.6, 96.6, 180.6, 54.6, 138.6, 222.6, 14.7, 98.7, 182.7, 56.7, 140.7, 224.7, + 16.8, 100.8, 184.8, 58.8, 142.8, 226.8, 18.9, 102.9, 186.9, 60.9, 144.9, 228.9, + 21., 105., 189., 63., 147., 231., 23.1, 107.1, 191.1, 65.1, 149.1, 233.1, + 25.2, 109.2, 193.2, 67.2, 151.2, 235.2, 27.3, 111.3, 195.3, 69.3, 153.3, 237.3, + 29.4, 113.4, 197.4, 71.4, 155.4, 239.4, 31.5, 115.5, 199.5, 73.5, 157.5, 241.5, + 33.6, 117.6, 201.6, 75.6, 159.6, 243.6, 35.7, 119.7, 203.7, 77.7, 161.7, 245.7, + 37.8, 121.8, 205.8, 79.8, 163.8, 247.8, 39.9, 123.9, 207.9, 81.9, 165.9, 249.9}; + +#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR + + const floatTypeCompute alphaValue = 2.1f; + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + hiptensorTensorDescriptor_t descA; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor( + handle, &descA, nmodeA, extentA.data(), NULL /* stride */, typeA, HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t descB; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor( + handle, &descB, nmodeB, extentB.data(), NULL /* stride */, typeB, HIPTENSOR_OP_IDENTITY)); + + hiptensor::detail::permuteByCpu(&alphaValue, + aArray.data(), + &descA, + modeA.data(), + bArray.data(), + &descB, + modeB.data(), + typeCompute); + return compareEqual(referenceArray.data(), bArray.data(), bArray.size(), 10); +} + +TEST(PermutationCpuImplTest, CompareF32ResultWithReference) +{ + typedef float floatTypeA; + typedef float floatTypeB; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_32F; + hipDataType typeB = HIP_R_32F; + hipDataType typeCompute = HIP_R_32F; + + auto [result, maxRelativeError] + = permuteWithCpu(typeA, typeB, typeCompute); + EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError; +} + +TEST(PermutationCpuImplTest, CompareF16ResultWithReference) +{ + typedef _Float16 floatTypeA; + typedef _Float16 floatTypeB; + typedef _Float16 floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeCompute = HIP_R_16F; + + auto [result, maxRelativeError] + = permuteWithCpu(typeA, typeB, typeCompute); + EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError; +} From bba3217ade0b285d9a1f718f8d74f8766121e63b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:01:47 +0000 Subject: [PATCH 04/42] Bump gitpython from 3.1.35 to 3.1.37 in /docs/.sphinx Bumps [gitpython](https://github.com/gitpython-developers/GitPython) from 3.1.35 to 3.1.37. - [Release notes](https://github.com/gitpython-developers/GitPython/releases) - [Changelog](https://github.com/gitpython-developers/GitPython/blob/main/CHANGES) - [Commits](https://github.com/gitpython-developers/GitPython/compare/3.1.35...3.1.37) --- updated-dependencies: - dependency-name: gitpython dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 22885896..0441a14a 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -40,7 +40,7 @@ fastjsonschema==2.16.3 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests From 3f7a904e18f5473ef11f992bd5c285b1425b06cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:02:23 +0000 Subject: [PATCH 05/42] Bump urllib3 from 1.26.15 to 1.26.18 in /docs/.sphinx Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.15 to 1.26.18. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.15...1.26.18) --- updated-dependencies: - dependency-name: urllib3 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 22885896..60e980b6 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -143,7 +143,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme -urllib3==1.26.15 +urllib3==1.26.18 # via requests wrapt==1.15.0 # via deprecated From 23b46d62952e22de4372bc40ffa1eaf5e59b3a9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 08:46:44 +0000 Subject: [PATCH 06/42] Bump rocm-docs-core from 0.30.0 to 0.30.1 in /docs/.sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.0 to 0.30.1. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.0...v0.30.1) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 22885896..17b81d3d 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.30.0 +rocm-docs-core==0.30.1 # via -r requirements.in smmap==5.0.0 # via gitdb From 852992e891fa54f7e3384485f5f12294f26af385 Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Thu, 7 Dec 2023 11:04:21 -0700 Subject: [PATCH 07/42] Fix spelling in documentation (#155) * Fix spelling in documentation * Use code directive to escape code keywords * Revert "Use code directive to escape code keywords" This reverts commit 7be7e3446830f2d7dd2760f140c7dfa30a246f78. 
* Disable spellcheck on API Reference Guide * Fix spelling in API Reference Guide hiptensor > hipTensor --- docs/API_Reference_Guide.rst | 7 +++++-- docs/Contributors_Guide.rst | 25 +++++++++++------------ docs/Linux_Install_Guide.rst | 29 ++++++++++++++------------- docs/Programmers_Guide.rst | 39 ++++++++++++++++++------------------ docs/index.rst | 4 ++-- 5 files changed, 53 insertions(+), 51 deletions(-) diff --git a/docs/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst index 551e2ee0..77e86343 100644 --- a/docs/API_Reference_Guide.rst +++ b/docs/API_Reference_Guide.rst @@ -3,15 +3,16 @@ Introduction ************ -hiptensor Data Types +hipTensor Data Types ==================== +.. + hiptensorStatus_t ----------------- .. doxygenenum:: hiptensorStatus_t - hiptensorComputeType_t ---------------------- @@ -160,3 +161,5 @@ hiptensorLoggerForceDisable --------------------------- .. doxygenfunction:: hiptensorLoggerForceDisable + +.. diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst index aeb87211..d75a884b 100644 --- a/docs/Contributors_Guide.rst +++ b/docs/Contributors_Guide.rst @@ -15,8 +15,7 @@ License Agreement Pull-request guidelines ======================= - -Our code contriubtion guidelines closely follows the model of `GitHub +Our code contribution guidelines closely follows the model of `GitHub pull-requests `__. The hipTensor repository follows a workflow which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. Pull requests should: @@ -30,7 +29,7 @@ The hipTensor repository follows a workflow which dictates a /master branch wher - code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. -StyleGuide +Style Guide ========== This project follows the `CPP Core @@ -44,7 +43,7 @@ Interface --------- - Library code should use C++17 -- Avoid CamelCase +- Avoid Camel case - This rule applies specifically to publicly visible APIs, but is also encouraged (not mandated) for internal code @@ -52,8 +51,8 @@ Philosophy ---------- - `P.2 `__: - Write in ISO Standard C++14 (especially to support windows, linux and - macos plaforms ) + Write in ISO Standard C++14 (especially to support Windows, Linux and + macOS platforms ) - `P.5 `__: Prefer compile-time checking to run-time checking @@ -105,19 +104,19 @@ will result in different results. To format a file, use: -:: +.. code-block:: - /opt/rocm/llvm/bin/clang-format -style=file -i + /opt/rocm/llvm/bin/clang-format -style=file -i To format all files, run the following script in hipTensor directory: -:: +.. code-block:: - #!/bin/bash - git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i + #!/bin/bash + git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i Also, githooks can be installed to format the code per-commit: -:: +.. code-block:: - ./.githooks/install + ./.githooks/install diff --git a/docs/Linux_Install_Guide.rst b/docs/Linux_Install_Guide.rst index 47cdc339..ace565c1 100644 --- a/docs/Linux_Install_Guide.rst +++ b/docs/Linux_Install_Guide.rst @@ -104,9 +104,9 @@ Minimum ROCm version support is 5.7. By default, the project is configured as Release mode. -To build only library, run the following comomand : +To build only library, run the following command : - CC=hipcc CXX=hipcc cmake -B . 
-DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF + :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF` Here are some other example project configurations: @@ -116,30 +116,30 @@ Here are some other example project configurations: +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ | Configuration | Command | +===================================+====================================================================================================================+ -| Basic | CC=hipcc CXX=hipcc cmake -B . | +| Basic | :code:`CC=hipcc CXX=hipcc cmake -B .` | +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Targeting gfx908 | CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack- | +| Targeting gfx908 | :code:`CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-` | +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ -| Debug build | CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug | +| Debug build | :code:`CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug` | +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+ After configuration, build with - cmake --build -- -j + :code:`cmake --build -- -j` Build library + samples ^^^^^^^^^^^^^^^^^^^^^^^ -To build library and samples, run the following comomand : +To build library and samples, run the following command: - CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON + :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON` After configuration, build with - cmake --build -- -j + :code:`cmake --build -- -j` -The samples folder in contains executables in the table below. +The samples folder in :code:`` contains executables in the table below. =================================== =================================================================================== executable name description @@ -154,13 +154,13 @@ Build library + tests To build library and tests, run the following command : - CC=hipcc CXX=hipcc cmake -B . + :code:`CC=hipcc CXX=hipcc cmake -B .` After configuration, build with - cmake --build -- -j + :code:`cmake --build -- -j` -The tests in contains executables in the table below. +The tests in `` contains executables in the table below. ====================================== =================================================================================== executable name description @@ -177,6 +177,7 @@ Build library + Documentation Run the steps below to build documentation locally. +.. code-block:: cd docs sudo apt-get update @@ -191,4 +192,4 @@ Run the steps below to build documentation locally. 
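    # produces hiptensor.pdf from the generated LaTeX sources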
pdflatex hiptensor.tex -Generates hiptensor.pdf here +Generates :code:`hiptensor.pdf` here diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst index 460bb970..1eaf9adf 100644 --- a/docs/Programmers_Guide.rst +++ b/docs/Programmers_Guide.rst @@ -1,4 +1,3 @@ - =================== Programmer's Guide =================== @@ -17,13 +16,13 @@ The hipTensor code is split into four major parts: The `library` directory ^^^^^^^^^^^^^^^^^^^^^^^ -library/include/hiptensor/ +`library/include/hiptensor/` ''''''''''''''''''''''''''' Contains C++ include files for the hipTensor API. These files also contain Doxygen comments that document the API. -library/include/hiptensor/internal +`library/include/hiptensor/internal` '''''''''''''''''''''''''''''''''' Internal include files for: @@ -31,58 +30,58 @@ Internal include files for: - Utility Code - Generate Tensor Utility -library/src/ +`library/src/` '''''''''''' Contains logger, device and performance functions. -library/src/contraction/ +`library/src/contraction/` '''''''''''''''''''''''' Contains hipTensor core composable kernel header functions and contraction initialization functions. -library/src/contraction/device +`library/src/contraction/device` '''''''''''''''''''''''''''''' Contains hipTensor Bilinear and Scale instance functions The `samples` directory ^^^^^^^^^^^^^^^^^^^^^^^ -01_contraction/simple_bilinear_contraction_f32.cpp +`01_contraction/simple_bilinear_contraction_f32.cpp` '''''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling bilinear contraction for fp32 input, output and compute types +sample code for calling bilinear contraction for :code:`fp32` input, output and compute types -01_contraction/simple_scale_contraction_f32.cpp +`01_contraction/simple_scale_contraction_f32.cpp` ''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling scale contraction for fp32 input, output and compute types +sample code for calling scale contraction for :code:`fp32` input, output and compute types The `test` directory ^^^^^^^^^^^^^^^^^^^^^^^ -00_unit/logger +`00_unit/logger` '''''''''''''' Test code for testing logger API Functions of hipTensor -01_contraction/bilinear_contraction_f32 +`01_contraction/bilinear_contraction_f32` ''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F32 types. -01_contraction/bilinear_contraction_f64 +`01_contraction/bilinear_contraction_f64` ''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F64 types. -01_contraction/scale_contraction_f32 +`01_contraction/scale_contraction_f32` '''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F32 types. -01_contraction/scale_contraction_f64 +`01_contraction/scale_contraction_f64` '''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F64 types. @@ -90,11 +89,11 @@ Test code for testing the scale contraction functionality and log metrics for F6 Infrastructure ^^^^^^^^^^^^^^ -- CMake is used to build and package hipTensor. There are CMakeLists.txt files throughout the code. -- Doxygen/Breathe/Sphinx/ReadTheDocs are used to produce documentation. Content for the documentation is from: +- CMake is used to build and package hipTensor. There are :code:`CMakeLists.txt` files throughout the code. +- `Doxygen/Breathe/Sphinx/ReadtheDocs` are used to produce documentation. 
Content for the documentation is from: - - Doxygen comments in include files in the directory library/include - - files in the directory docs/ + - Doxygen comments in include files in the directory :code:`library/include` + - files in the directory :code:`docs/` - Jenkins is used to automate Continuous Integration testing. -- clang-format is used to format C++ code. +- :code:`clang-format` is used to format C++ code. diff --git a/docs/index.rst b/docs/index.rst index 566a00e5..ba5e1cb7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,6 @@ ============================================================================ -hiptensor: A High-Performance HIP Library For Tensor Primitives +hipTensor: A High-Performance HIP Library For Tensor Primitives ============================================================================ -hiptensor is AMD's C++ library for accelerating tensor primitives based on the +hipTensor is AMD's C++ library for accelerating tensor primitives based on the composable kernel library, through general purpose kernel languages, like HIP C++. From c5fbcec9afdfe109b1ba1a15d74d819beec2e6fe Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 22 Nov 2023 16:21:33 +0000 Subject: [PATCH 08/42] Add support to f16 and bf16 to contraction - Support _Float16 - Support hip_bfloat16 - Add unit test of _Float16 and hip_bfloat16 - Add sample of _Float16 and hip_bfloat16 --- .../hiptensor/internal}/config.hpp | 0 .../hiptensor/internal/hiptensor_utility.hpp | 1 + .../hiptensor/internal}/native_types.hpp | 0 .../hiptensor/internal}/native_types_impl.hpp | 0 .../hiptensor/internal}/type_traits.hpp | 0 .../hiptensor/internal}/types.hpp | 0 .../hiptensor/internal}/types_ext.hpp | 0 .../hiptensor/internal}/xfloat32.hpp | 0 .../contraction_cpu_reference_impl.hpp | 19 +- .../contraction_cpu_reference_instances.cpp | 56 +++ .../contraction/contraction_meta_traits.hpp | 47 ++- .../src/contraction/contraction_selection.cpp | 280 +++++++++++++- .../src/contraction/contraction_solution.hpp | 3 +- .../contraction/contraction_solution_impl.hpp | 6 +- .../contraction_solution_instances.cpp | 57 +++ library/src/contraction/device/CMakeLists.txt | 56 ++- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 62 ++++ ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 62 ++++ ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 62 ++++ ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_mknn_instance.cpp | 62 ++++ ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 62 ++++ ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 62 ++++ ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 62 ++++ ...hpp => hiptensor_contraction_bilinear.hpp} | 0 ...le.hpp => hiptensor_contraction_scale.hpp} | 0 .../src/contraction/hiptensor_contraction.cpp | 11 - library/src/hiptensor.cpp | 3 +- samples/01_contraction/CMakeLists.txt | 15 + .../simple_bilinear_contraction_bf16.cpp | 342 ++++++++++++++++++ .../simple_bilinear_contraction_f16.cpp | 342 ++++++++++++++++++ .../simple_scale_contraction_bf16.cpp | 334 +++++++++++++++++ .../simple_scale_contraction_f16.cpp | 334 
+++++++++++++++++ .../configs/bilinear_test_params.yaml | 2 + .../configs/scale_test_params.yaml | 2 + test/01_contraction/contraction_test.cpp | 109 +++++- test/device/common.hpp | 2 +- test/llvm/yaml_parser_config.cpp | 1 + test/utils.hpp | 2 +- 47 files changed, 2944 insertions(+), 72 deletions(-) rename library/{src/include => include/hiptensor/internal}/config.hpp (100%) rename library/{src/include => include/hiptensor/internal}/native_types.hpp (100%) rename library/{src/include => include/hiptensor/internal}/native_types_impl.hpp (100%) rename library/{src/include => include/hiptensor/internal}/type_traits.hpp (100%) rename library/{src/include => include/hiptensor/internal}/types.hpp (100%) rename library/{src/include => include/hiptensor/internal}/types_ext.hpp (100%) rename library/{src/include => include/hiptensor/internal}/xfloat32.hpp (100%) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp rename library/src/contraction/device/{contraction_bilinear.hpp => hiptensor_contraction_bilinear.hpp} (100%) rename library/src/contraction/device/{contraction_scale.hpp => hiptensor_contraction_scale.hpp} (100%) create mode 100644 samples/01_contraction/simple_bilinear_contraction_bf16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f16.cpp 
create mode 100644 samples/01_contraction/simple_scale_contraction_bf16.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_f16.cpp diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp similarity index 100% rename from library/src/include/config.hpp rename to library/include/hiptensor/internal/config.hpp diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp index f2df2dd2..c386bbe0 100644 --- a/library/include/hiptensor/internal/hiptensor_utility.hpp +++ b/library/include/hiptensor/internal/hiptensor_utility.hpp @@ -31,6 +31,7 @@ #include #include "../hiptensor_types.hpp" +#include "types_ext.hpp" #ifndef CHECK_HIP_ERROR #define CHECK_HIP_ERROR(expression) \ diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp similarity index 100% rename from library/src/include/native_types.hpp rename to library/include/hiptensor/internal/native_types.hpp diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp similarity index 100% rename from library/src/include/native_types_impl.hpp rename to library/include/hiptensor/internal/native_types_impl.hpp diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp similarity index 100% rename from library/src/include/type_traits.hpp rename to library/include/hiptensor/internal/type_traits.hpp diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp similarity index 100% rename from library/src/include/types.hpp rename to library/include/hiptensor/internal/types.hpp diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp similarity index 100% rename from library/src/include/types_ext.hpp rename to library/include/hiptensor/internal/types_ext.hpp diff --git a/library/src/include/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp similarity index 100% rename from library/src/include/xfloat32.hpp rename to library/include/hiptensor/internal/xfloat32.hpp diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 673f6dff..ac4fc20d 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -57,6 +57,7 @@ namespace hiptensor typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, + typename ComputeDataType = ADataType, ck::enable_if_t = false> @@ -70,7 +71,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation> + CDEElementwiseOperation, + ComputeDataType> { using BaseArgument = ck::tensor_operation::device::BaseArgument; using BaseInvoker = ck::tensor_operation::device::BaseInvoker; @@ -324,7 +326,8 @@ namespace hiptensor typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>> : public MetaTraits< ck::tensor_operation::device::DeviceContractionMultipleD> + CDEElementwiseOperation, + ComputeDataType>> { }; @@ -359,7 +364,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename 
CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> auto enumerateReferenceSolutions() { using ReferenceOp = ReferenceContraction_M2_N2_K2; + CDEElementwiseOperation, + ComputeDataType>; auto solution = std::make_unique>( std::make_unique()); diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 106dd5ff..146d2721 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -32,6 +32,34 @@ namespace hiptensor ContractionCpuReferenceInstances::ContractionCpuReferenceInstances() { // Register all the solutions exactly once + // Bilinear f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<_Float16>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateReferenceSolutions<2, @@ -58,6 +86,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear>()); + // Scale f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateReferenceSolutions<2, diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 4fa7acf7..ab158f96 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -49,7 +49,8 @@ namespace hiptensor typename DsDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Bilinear, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = DsDataType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT + = std::conditional_t, hip_bfloat16, DsDataType>; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = 
ck::tensor_operation::element_wise::Bilinear; }; // Partial specialize for Scale contraction @@ -82,7 +88,8 @@ namespace hiptensor typename BDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = NoneType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT = NoneType; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Scale; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index aaa624f6..1b2cf92e 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -150,6 +150,192 @@ namespace hiptensor } } + // test + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 7255639152084218514; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 7255639152084218514; + + 
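+            // The hard-coded id above pins a known-good pre-registered kernel
+            // instance for this type combination; d1..d6 are extracted,
+            // presumably for a later size-aware (actor-critic) selection, but
+            // are currently unused. If the id is absent from the candidate
+            // registry, the lookup below fails rather than falling back to
+            // another instance.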
if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + // end test + template <> struct ActorCriticSelection { @@ -1418,7 +1604,99 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { - if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F + && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + 
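+        // bf16 follows the same dispatch rule as f16 above: typeD == NONE_TYPE
+        // routes to the scale contraction, a concrete D type to bilinear.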
else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE + && typeE == HIP_R_32F) { return ActorCriticSelection:: selectWinner(winner, diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index 0037584e..e76bb351 100644 --- a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -147,7 +147,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> std::vector> enumerateContractionSolutions(); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 0fb5df9d..5e191441 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -274,7 +274,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> std::vector> enumerateContractionSolutions() { using ContractionOp @@ -287,7 +288,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation>; + CDEElementwiseOperation, + ComputeDataType>; using Factory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index fd263a8b..6d481577 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -32,6 +32,35 @@ namespace hiptensor ContractionSolutionInstances::ContractionSolutionInstances() { // Register all the solutions exactly once + + // Bilinear bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateContractionSolutions<2, @@ -58,6 +87,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, 
ck::tensor_operation::element_wise::Bilinear>()); + // Scale bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateContractionSolutions<2, diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index f2e4a0fb..b9b382c0 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -24,24 +24,40 @@ # ############################################################################### -set(CK_CONTRACTION_INSTANCE_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp -) + set(CK_CONTRACTION_INSTANCE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + 
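+    # f16 bilinear instances; like the bf16 set above, these accumulate in f32
+    # (the compute_f32 suffix, matching ComputeDataType = float in this patch)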
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) -add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) -target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) + add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) + 
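+    # PRIVATE: composable_kernel headers are needed to compile the instances
+    # themselves, not by consumers of the component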
target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..7d777a83 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..a9a97148 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..d83d8d16 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
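+// (Naming, on our reading of composable_kernel: "xdl" marks the MFMA/XDLOPS-based
+// GEMM pipeline, and "c_shuffle" its epilogue, which stages the accumulator tile
+// through LDS so that global stores coalesce.)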
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..bc49c82b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
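+// (Four stride variants -- kknn/knnn/mknn/mnnn -- are built per type combination so
+// that an instance exists whichever of A's and B's modes is fastest-varying in
+// memory; hipTensor presumably selects among them at plan time from the tensors'
+// strides.)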
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..a9d963ab --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
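+// For reference, a scalar sketch of the bilinear contraction that every instance in
+// this family implements (illustration only, with hypothetical extents M0..K1 and
+// dense row-major indexing; the device kernels tile, vectorize, and use MFMA instead):
+//
+//     for(int m0 = 0; m0 < M0; ++m0)
+//     for(int m1 = 0; m1 < M1; ++m1)
+//     for(int n0 = 0; n0 < N0; ++n0)
+//     for(int n1 = 0; n1 < N1; ++n1)
+//     {
+//         float acc = 0.f;
+//         for(int k0 = 0; k0 < K0; ++k0)
+//         for(int k1 = 0; k1 < K1; ++k1)
+//         {
+//             acc += float(A[((m0 * M1 + m1) * K0 + k0) * K1 + k1])
+//                  * float(B[((n0 * N1 + n1) * K0 + k0) * K1 + k1]);
+//         }
+//         E[((m0 * M1 + m1) * N0 + n0) * N1 + n1]
+//             = alpha * acc
+//             + beta * float(D[((m0 * M1 + m1) * N0 + n0) * N1 + n1]);
+//     }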
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..c139942e --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..3c6ced30 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..33c66296 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..05400151 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
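+// (The "scale" family that begins here is the bilinear operator without the D input:
+// the epilogue reduces to E[m0, m1, n0, n1] = alpha * sum over (k0, k1) of A * B,
+// which is why these names carry three element types rather than four.)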
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..bba95b14 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..fb5ecec0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1dd6613c --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..e98aee20 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
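+// (As in the bilinear f16/bf16 families, "compute_f32" means products are accumulated
+// in float even though A, B, and E are 16-bit; the samples request the matching
+// HIPTENSOR_COMPUTE_32F compute type.)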
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..db8de1c0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..397ef327 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1f9221dc --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear.hpp similarity index 100% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear.hpp diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale.hpp similarity index 100% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale.hpp diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..b96a204e 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -708,17 +708,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..51af1f48 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..15972d60 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,15 +26,30 @@ # Check whether building within hiptensor context if( 
CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) + add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) + target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp new file mode 100644 index 00000000..0a4a9314 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeC = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
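+        // Device scratch recommended by hiptensorContractionGetWorkspaceSize above;
+        // a returned size of zero means the contraction can run without a workspace.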
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp new file mode 100644 index 00000000..d9d044c9 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeC = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
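+        // HIPTENSOR_WORKSPACE_RECOMMENDED was requested above; a leaner
+        // HIPTENSOR_WORKSPACE_MIN query may also be usable when device memory is
+        // tight (assuming this hipTensor version exposes that preference).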
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp new file mode 100644 index 00000000..e05916bf --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeD = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
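For the modes declared above, this sample evaluates (informally)
+ * D[m,n,u,v] = alpha * sum over h,k of A[m,n,h,k] * B[u,v,h,k],
+ * with m, n, u, v free and h, k contracted, via the steps below.
+ *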
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp new file mode 100644 index 00000000..1e62be85 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeD = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
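Storage here is _Float16 while typeCompute is HIPTENSOR_COMPUTE_32F,
+ * so alpha is supplied as a float and the h,k reduction is expected
+ * to accumulate at f32 precision before narrowing back to f16.
+ *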
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" 
<< std::endl; + + return 0; +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..a08065a0 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] Algorithm Types: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..b28e9a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] Algorithm Types: diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..9446157f 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,7 +56,8 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) || (isF64Supported() && datatype == HIP_R_64F); } @@ -115,11 +116,15 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) + || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) || (CDataType == NONE_TYPE)); - EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); mRunFlag &= checkDevice(DDataType); @@ -228,7 +233,35 @@ namespace hiptensor auto resource = getResource(); resource->resizeStorage(lengths, elementBytes); - if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) + if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F) + { + // Initialize matrix data on device + fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA); + fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16F) + { + fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD); + } + fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + elementsCD, + 
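// D is seeded with signaling NaNs so any element the kernel
+ // fails to write is caught by validation instead of passing
+ // silently
+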
std::numeric_limits<_Float16>::signaling_NaN()); + } + else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF) + { + // Initialize matrix data on device + fillLaunchKernel((hip_bfloat16*)resource->deviceA().get(), elementsA); + fillLaunchKernel((hip_bfloat16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16BF) + { + fillLaunchKernel((hip_bfloat16*)resource->deviceC().get(), + elementsCD); + } + fillValLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); + } + else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) { // Initialize matrix data on device fillLaunchKernel((float*)resource->deviceA().get(), elementsA); @@ -328,7 +361,7 @@ namespace hiptensor { auto resource = getResource(); - int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)); + int size = hipDataTypeSize(DDataType); size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(), a_ms_ks.mLengths.end(), @@ -346,7 +379,50 @@ namespace hiptensor auto D = resource->allocHost(elementsCD * size); resource->copyData(D, resource->deviceD(), elementsCD * size); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_16BF) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_32F) { stream << "Tensor A elements:\n"; hiptensorPrintArrayElements( @@ -456,11 +532,24 @@ namespace hiptensor size_t{1}, std::multiplies()); - int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? 
sizeof(float) : sizeof(double)); + int sizeD = elementsCD * hipDataTypeSize(DDataType); auto reference = resource->allocDevice(sizeD); resource->copyData(reference, resource->hostD(), sizeD); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( + (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + } + else if(DDataType == HIP_R_16BF) + { + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + (hip_bfloat16*)reference.get(), + elementsCD); + } + else if(DDataType == HIP_R_32F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD);
diff --git a/test/device/common.hpp b/test/device/common.hpp index f961abc1..172e6953 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,7 +72,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast(value); } }
diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 46f4c43e..cd3eb46f 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -110,6 +110,7 @@ namespace llvm static void enumeration(IO& io, hipDataType& value) { io.enumCase(value, "HIP_R_16F", HIP_R_16F); + io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE);
diff --git a/test/utils.hpp b/test/utils.hpp index 1f7ece44..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -41,9 +41,9 @@ #include #include #include +#include #include "device/common.hpp" -#include "types.hpp" #define HIPTENSOR_FREE_DEVICE(ptr) \ if(ptr != nullptr) \
From 185a2ab115d4e6e8999917e349ca1a4d803e5228 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Sat, 25 Nov 2023 02:38:48 +0000 Subject: [PATCH 09/42] Add f32_f16, f32_bf16, and f64_f32 support to contraction - Support ABCD data type f32 with compute types f16 and bf16 - Support ABCD data type f64 with compute type f32 - Fix bug: alpha and beta were passed with the wrong data type in the contraction unit test - Create sample templates for contraction --- .../contraction/contraction_cpu_reference.cpp | 48 +- .../contraction/contraction_cpu_reference.hpp | 39 +- .../contraction_cpu_reference_instances.cpp | 104 ++- .../contraction/contraction_meta_traits.hpp | 18 +- .../src/contraction/contraction_selection.cpp | 737 ++++++++++++++---- .../src/contraction/contraction_selection.hpp | 9 +- .../contraction/contraction_solution_impl.hpp | 9 +- .../contraction_solution_instances.cpp | 88 ++- .../contraction_solution_params.hpp | 9 +- .../contraction_solution_params_impl.hpp | 6 + .../contraction_solution_registry.cpp | 81 +- .../contraction_solution_registry.hpp | 60 +- library/src/contraction/device/CMakeLists.txt | 28 +- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 27 +-
..._f16_f16_f16_compute_f32_mknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 27 +- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 85 ++ ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 27 +- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 85 ++ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 62 ++ .../src/contraction/hiptensor_contraction.cpp | 59 +- library/src/data_types.cpp | 43 + library/src/include/data_types.hpp | 2 + samples/01_contraction/CMakeLists.txt | 31 + .../simple_bilinear_contraction.hpp | 351 +++++++++ .../simple_bilinear_contraction_bf16.cpp | 313 +------- .../simple_bilinear_contraction_f16.cpp | 313 +------- .../simple_bilinear_contraction_f32.cpp | 313 +------- .../simple_bilinear_contraction_f32_bf16.cpp | 57 ++ .../simple_bilinear_contraction_f32_f16.cpp | 57 ++ .../simple_bilinear_contraction_f64.cpp | 57 ++ .../simple_bilinear_contraction_f64_f32.cpp | 57 ++ .../simple_scale_contraction.hpp | 341 ++++++++ .../simple_scale_contraction_bf16.cpp | 311 +------- .../simple_scale_contraction_f16.cpp | 317 +------- .../simple_scale_contraction_f32.cpp | 310 +------- .../simple_scale_contraction_f32_bf16.cpp | 58 ++ .../simple_scale_contraction_f32_f16.cpp | 58 ++ .../simple_scale_contraction_f64.cpp | 57 ++ .../simple_scale_contraction_f64_f32.cpp | 57 ++ .../configs/bilinear_test_params.yaml | 7 +- .../configs/scale_test_params.yaml | 3 + test/01_contraction/contraction_test.cpp | 18 +- 76 files changed, 4657 insertions(+), 2149 deletions(-) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp create mode 100644 
library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction.hpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp create mode 100644 samples/01_contraction/simple_scale_contraction.hpp create mode 100644 samples/01_contraction/simple_scale_contraction_f32_bf16.cpp create mode 100644 
samples/01_contraction/simple_scale_contraction_f32_f16.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_f64.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_f64_f32.cpp diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp index 13dcdffd..ac1d9711 100644 --- a/library/src/contraction/contraction_cpu_reference.cpp +++ b/library/src/contraction/contraction_cpu_reference.cpp @@ -28,31 +28,33 @@ #include "contraction_cpu_reference_impl.hpp" #include "contraction_cpu_reference_instances.hpp" -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ns_ks_lengths, - std::vector const& b_ns_ks_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - void* workspace) +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace) { - auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto computeType = plan->mContractionDesc.mComputeType; auto candidates - = (C == nullptr) - ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD) - : instances->allSolutions().query(typeA, typeB, typeC, typeD); + = (C == nullptr) ? 
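// computeType is now part of the solution key; e.g. a scale
+ // contraction on f32 tensors with f16 compute would resolve via
+ // query(HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F)
+ // (illustrative values, mirroring the test configs)
+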
instances->allSolutions().query( + typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType) + : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType); auto toCKVec = [](auto& inputVec) { return std::vector(inputVec.begin(), inputVec.end()); }; diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp index aadb062e..471026dc 100644 --- a/library/src/contraction/contraction_cpu_reference.hpp +++ b/library/src/contraction/contraction_cpu_reference.hpp @@ -32,24 +32,25 @@ #include -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ks_ns_lengths, - std::vector const& b_ks_ns_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - void* workspace); +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ks_ns_lengths, + std::vector const& b_ks_ns_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace); #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 146d2721..173a49e9 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -37,10 +37,10 @@ namespace hiptensor enumerateReferenceSolutions<2, 2, 2, - _Float16, - _Float16, - ck::Tuple<_Float16>, - _Float16, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear, @@ -71,7 +71,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); // Bilinear f64 registerSolutions( @@ -84,17 +111,31 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + 
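// the trailing template argument below is the new ComputeDataType;
+ // each (A, B, Ds, E) combination is enumerated once per supported
+ // compute type (e.g. float tensors with ck::half_t or ck::bhalf_t)
+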
enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale f16 registerSolutions( enumerateReferenceSolutions<2, 2, 2, - _Float16, - _Float16, + ck::half_t, + ck::half_t, ck::Tuple<>, - _Float16, + ck::half_t, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, @@ -125,7 +166,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f64 registerSolutions( @@ -138,6 +206,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index ab158f96..6a7cb35f 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -75,9 +75,12 @@ namespace hiptensor = std::conditional_t, hip_bfloat16, DsDataType>; using EDataT = std::conditional_t, hip_bfloat16, EDataType>; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using ComputeDataT = std::conditional_t, + hip_bfloat16, + ComputeDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Bilinear; }; // Partial specialize for Scale contraction @@ -113,9 +116,12 @@ namespace hiptensor using DDataT = NoneType; using EDataT = std::conditional_t, hip_bfloat16, EDataType>; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using ComputeDataT = std::conditional_t, + hip_bfloat16, + ComputeDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Scale; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 1b2cf92e..888ef4c1 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -54,6 +54,7 @@ namespace hiptensor 
hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { // Make sure that we calculate full element space in case strides are not packed.
@@ -70,9 +71,11 @@ namespace hiptensor auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides) * hipDataTypeSize(typeE); - void *A_d, *B_d, *D_d, *E_d, *wspace; - float alpha = 1.02f; - float beta = 1.03f; + void * A_d, *B_d, *D_d, *E_d, *wspace; + double alpha = 0.0; + double beta = 0.0; + writeVal(&alpha, computeType, 1.02); + writeVal(&beta, computeType, 1.03); CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB));
@@ -150,9 +153,13 @@ namespace hiptensor } } - // test template <> - struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE> + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -196,7 +203,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR> + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -224,7 +236,7 @@ namespace hiptensor size_t unique_id = 0; - // select unique_id + // TODO select unique_id unique_id = 7255639152084218514; if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -244,7 +256,8 @@ namespace hiptensor hip_bfloat16, hip_bfloat16, hip_bfloat16, - ContractionOpId_t::SCALE> + ContractionOpId_t::SCALE, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -292,7 +305,8 @@ namespace hiptensor hip_bfloat16, hip_bfloat16, hip_bfloat16, - ContractionOpId_t::BILINEAR> + ContractionOpId_t::BILINEAR, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner,
@@ -320,7 +334,7 @@ namespace hiptensor size_t unique_id = 0; - // select unique_id + // TODO select unique_id unique_id = 8689089455041651212; if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -334,10 +348,183 @@ namespace hiptensor } } }; - // end test template <> - struct ActorCriticSelection + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner,
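// each specialization pins a kernel unique_id chosen offline by the
+ // actor-critic model; ids left at 0 below are placeholders for the
+ // new compute-type combinations (see the TODO markers)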
+ std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -702,7 +889,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1060,7 +1247,92 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& 
candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1335,7 +1607,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1602,181 +1874,344 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { - if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::SCALE>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F - && typeE == HIP_R_16F) + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return 
ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::BILINEAR>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, 
+ d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_64F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - 
a_ms_ks_lengths,
-                                                                 a_ms_ks_strides,
-                                                                 typeB,
-                                                                 b_ns_ks_lengths,
-                                                                 b_ns_ks_strides,
-                                                                 typeD,
-                                                                 d_ms_ns_lengths,
-                                                                 d_ms_ns_strides,
-                                                                 typeE,
-                                                                 e_ms_ns_lengths,
-                                                                 e_ms_ns_strides,
-                                                                 workspaceSize);
+                                              ContractionOpId_t::BILINEAR,
+                                              double>::selectWinner(winner,
+                                                                    candidates,
+                                                                    typeA,
+                                                                    a_ms_ks_lengths,
+                                                                    a_ms_ks_strides,
+                                                                    typeB,
+                                                                    b_ns_ks_lengths,
+                                                                    b_ns_ks_strides,
+                                                                    typeD,
+                                                                    d_ms_ns_lengths,
+                                                                    d_ms_ns_strides,
+                                                                    typeE,
+                                                                    e_ms_ns_lengths,
+                                                                    e_ms_ns_strides,
+                                                                    workspaceSize);
     }

     return HIPTENSOR_STATUS_EXECUTION_FAILED;
 }
diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp
index 9ceb6a14..deb980d9 100644
--- a/library/src/contraction/contraction_selection.hpp
+++ b/library/src/contraction/contraction_selection.hpp
@@ -49,9 +49,15 @@ namespace hiptensor
                      hipDataType                     typeE,
                      std::vector<std::size_t> const& e_ms_ns_lengths,
                      std::vector<std::size_t> const& e_ms_ns_strides,
+                     hiptensorComputeType_t          computeType,
                      const uint64_t                  workspaceSize);

-    template <typename TypeA, typename TypeB, typename TypeD, typename TypeE, ContractionOpId_t OpId>
+    template <typename TypeA,
+              typename TypeB,
+              typename TypeD,
+              typename TypeE,
+              ContractionOpId_t OpId,
+              typename ComputeT>
     struct ActorCriticSelection
     {
         static hiptensorStatus_t
@@ -87,6 +93,7 @@ namespace hiptensor
                       hipDataType                     typeE,
                       std::vector<std::size_t> const& e_ms_ns_lengths,
                       std::vector<std::size_t> const& e_ms_ns_strides,
+                      hiptensorComputeType_t          computeType,
                       const uint64_t                  workspaceSize);
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp
index 5e191441..3b672fbb 100644
--- a/library/src/contraction/contraction_solution_impl.hpp
+++ b/library/src/contraction/contraction_solution_impl.hpp
@@ -95,11 +95,13 @@ namespace hiptensor

             if(alpha != nullptr)
             {
-                alphaF = hiptensor::readVal<float>(alpha, HipDataType_v<typename Traits::ComputeDataT>);
+                alphaF = hiptensor::readVal<float>(
+                    alpha, convertToComputeType(HipDataType_v<typename Traits::ComputeDataT>));
             }
             if(beta != nullptr)
             {
-                betaF = hiptensor::readVal<float>(beta, HipDataType_v<typename Traits::ComputeDataT>);
+                betaF = hiptensor::readVal<float>(
+                    beta, convertToComputeType(HipDataType_v<typename Traits::ComputeDataT>));
             }

             // CK has its own format for indices...
@@ -205,7 +207,8 @@ namespace hiptensor

             if(alpha != nullptr)
             {
-                alphaF = hiptensor::readVal<float>(alpha, HipDataType_v<typename Traits::ComputeDataT>);
+                alphaF = hiptensor::readVal<float>(
+                    alpha, convertToComputeType(HipDataType_v<typename Traits::ComputeDataT>));
             }

             // CK has its own format for indices...
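
Taken together, the chain above maps every supported (typeA, typeB, typeD, typeE, computeType) tuple onto one ActorCriticSelection specialization, so the compute type is now part of the specialization key rather than being implied by the data types. The following is a minimal sketch of that pattern with simplified signatures; ActorCriticSelectionSketch and selectWinnerSketch are illustrative names, not hipTensor API.

#include <hip/library_types.h> // hipDataType, HIP_R_16F, HIP_R_32F, HIP_R_64F

// Stand-in for the real ActorCriticSelection; OpId and ComputeT mirror the
// template parameters the patch threads through above.
template <typename TypeA, typename TypeB, typename TypeD, typename TypeE,
          int OpId, typename ComputeT>
struct ActorCriticSelectionSketch
{
    static int selectWinner()
    {
        return 0; // would rank kernel candidates for this exact type combination
    }
};

inline int selectWinnerSketch(hipDataType typeA, hipDataType typeE, hipDataType computeType)
{
    constexpr int kScale = 0; // stand-in for ContractionOpId_t::SCALE
    // f32 data contracted with f16 accumulation and with f32 accumulation now
    // resolve to two different specializations.
    if(typeA == HIP_R_32F && typeE == HIP_R_32F && computeType == HIP_R_16F)
        return ActorCriticSelectionSketch<float, float, float, float, kScale, _Float16>::selectWinner();
    if(typeA == HIP_R_32F && typeE == HIP_R_32F && computeType == HIP_R_32F)
        return ActorCriticSelectionSketch<float, float, float, float, kScale, float>::selectWinner();
    return -1; // unsupported combination -> HIPTENSOR_STATUS_EXECUTION_FAILED above
}
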
diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 6d481577..aec12e32 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -72,7 +72,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); // Bilinear f64 registerSolutions( @@ -85,7 +112,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale bf16 registerSolutions( @@ -126,8 +166,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); // Scale f64 registerSolutions( enumerateContractionSolutions<2, @@ -139,6 +205,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp index ec9de45c..4c44de88 100644 --- a/library/src/contraction/contraction_solution_params.hpp +++ b/library/src/contraction/contraction_solution_params.hpp @@ -49,10 +49,11 @@ namespace hiptensor virtual int32_t dimsK() const = 0; // Map to hipDataType - virtual hipDataType typeA() const = 0; - 
virtual hipDataType typeB() const = 0; - virtual hipDataType typeC() const = 0; - virtual hipDataType typeD() const = 0; + virtual hipDataType typeA() const = 0; + virtual hipDataType typeB() const = 0; + virtual hipDataType typeC() const = 0; + virtual hipDataType typeD() const = 0; + virtual hiptensorComputeType_t typeCompute() const = 0; // Map to operators virtual hiptensorOperator_t opA() const = 0; diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index bff33960..b84f9c2b 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -42,6 +42,7 @@ namespace std return hiptensor::Hash{}(s.dimsM(), s.dimsN(), s.dimsK(), + s.typeCompute(), s.typeA(), s.typeB(), s.typeC(), @@ -102,6 +103,11 @@ namespace hiptensor return HipDataType_v; } + hiptensorComputeType_t typeCompute() const override + { + return convertToComputeType(HipDataType_v); + } + hiptensorOperator_t opA() const override { return ElementWiseOperatorType_v; diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp index 83674c81..9e2da1f9 100644 --- a/library/src/contraction/contraction_solution_registry.cpp +++ b/library/src/contraction/contraction_solution_registry.cpp @@ -53,19 +53,20 @@ namespace hiptensor } ContractionSolutionRegistry::Query - ContractionSolutionRegistry::Query::query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const + ContractionSolutionRegistry::Query::query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const { - auto solutionHash - = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + auto solutionHash = hashSolution( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end()) { @@ -81,10 +82,14 @@ namespace hiptensor return query(hashDimsMNK(dimsM, dimsN, dimsK)); } - ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const + ContractionSolutionRegistry::Query + ContractionSolutionRegistry::Query::query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const { - return query(hashTypesABCD(typeA, typeB, typeC, typeD)); + return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute)); } ContractionSolutionRegistry::Query @@ -159,18 +164,20 @@ namespace hiptensor /* static */ ContractionSolutionRegistry::Query::HashId - ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) + ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + 
hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) { - return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + return Hash{}( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); } /* static */ @@ -181,10 +188,14 @@ namespace hiptensor } /* static */ - ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) + ContractionSolutionRegistry::Query::HashId + ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) { - return Hash{}(typeA, typeB, typeC, typeD); + return Hash{}(typeA, typeB, typeC, typeD, typeCompute); } /* static */ @@ -220,12 +231,16 @@ namespace hiptensor params->typeD(), params->opA(), params->opB(), - params->opCDE()); + params->opCDE(), + params->typeCompute()); auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK()); - auto typesABCDHash - = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD()); + auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(), + params->typeB(), + params->typeC(), + params->typeD(), + params->typeCompute()); auto elementOpsHash = hashElementOps(params->opA(), params->opB()); @@ -236,7 +251,7 @@ namespace hiptensor mAllSolutions[solutionUid] = solution; mSolutionHash[solutionHash].push_back(solution); mSolutionHash[dimsMNKHash].push_back(solution); - mSolutionHash[typesABCDHash].push_back(solution); + mSolutionHash[typesComputeABCDHash].push_back(solution); mSolutionHash[elementOpsHash].push_back(solution); mSolutionHash[contactionOpsHash].push_back(solution); } diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp index d1b80ec5..44aaa97d 100644 --- a/library/src/contraction/contraction_solution_registry.hpp +++ b/library/src/contraction/contraction_solution_registry.hpp @@ -59,25 +59,27 @@ namespace hiptensor /// E.g. in this context, query further parameters. 
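
// A minimal sketch, not hipTensor code, of why typeCompute must join the key
// built by hashSolution/hashTypesComputeABCD above: two solutions that agree on
// all tensor types but differ in compute type must land in different buckets.
// hashCombineSketch and solutionKeySketch are illustrative names.

#include <cstddef>
#include <functional>

inline void hashCombineSketch(std::size_t& seed, std::size_t v)
{
    seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2); // boost-style mixing step
}

inline std::size_t solutionKeySketch(int typeA, int typeB, int typeC, int typeD, int typeCompute)
{
    std::size_t seed = 0;
    for(int field : {typeA, typeB, typeC, typeD, typeCompute})
        hashCombineSketch(seed, std::hash<int>{}(field));
    return seed; // same A/B/C/D types with f16 vs f32 compute now hash apart
}
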
// By full solution type - Query query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const; + Query query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const; // By dimensions Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const; // By data types - Query query(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD) const; + Query query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const; // By element-wise operations Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const; @@ -104,22 +106,24 @@ namespace hiptensor Query query(HashId queryHash) const; // Hashing helpers - static HashId hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE); + static HashId hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute); static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK); - static HashId hashTypesABCD(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD); + static HashId hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute); static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB); static HashId hashContractionOps(ContractionOpId_t opCDE); diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index b9b382c0..eacac5b1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -33,10 +33,22 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -49,15 +61,27 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp ) - add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) - target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) +add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) +target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index 7d777a83..3b3f6d47 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
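
Each source file registered in the CMake list above encodes its kernel configuration directly in its name: the tensor data types, the accumulation (compute) type, and the fast-changing-dimension layout. A small sketch of that grammar for the bilinear instances follows (scale instances carry one fewer tensor type and a three-letter layout); bilinearInstanceNameSketch is an illustrative helper, not part of the build.

#include <string>

// Illustrative only: reassembles names like
// device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance
// where "kknn" names the fast-changing dimension of A/B/D/E respectively.
inline std::string bilinearInstanceNameSketch(const std::string& dataType,    // "f16", "f32", "f64", ...
                                              const std::string& computeType, // "f16", "bf16", "f32"
                                              const std::string& layout)      // "kknn", "knnn", "mknn", "mnnn"
{
    return "device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_" + dataType + "_" + dataType
           + "_" + dataType + "_" + dataType + "_compute_" + computeType + "_" + layout
           + "_instance";
}
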
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index a9a97148..fd43f0ad 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index d83d8d16..21fb8127 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index bc49c82b..cc975c03 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index a9d963ab..ff670630 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index c139942e..be8bfe84 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 3c6ced30..4be69898 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index 33c66296..2f6d630b 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
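
// The kknn file just shown is the template for every new instance source in
// this patch: define one alias over a device_contraction_*_instance type list,
// then expose one add_* entry point that appends those kernels to the caller's
// vector. A reduced sketch of that pattern follows, under assumed names:
// DeviceOpSketch and the TunedKernel types stand in for the CK device-op types.

#include <memory>
#include <vector>

struct DeviceOpSketch // stand-in for the CK DeviceContractionMultipleD interface
{
    virtual ~DeviceOpSketch() = default;
};
struct TunedKernel1 : DeviceOpSketch {};
struct TunedKernel2 : DeviceOpSketch {};

// Stand-in for ck::tensor_operation::device::instance::add_device_operation_instances:
// append one heap-allocated entry per tuned kernel in the list.
template <typename... Kernels>
void addDeviceOperationInstancesSketch(std::vector<std::unique_ptr<DeviceOpSketch>>& instances)
{
    (instances.push_back(std::make_unique<Kernels>()), ...);
}

// Shape of each add_device_contraction_..._instance() entry point above.
void addComputeTypeInstancesSketch(std::vector<std::unique_ptr<DeviceOpSketch>>& instances)
{
    addDeviceOperationInstancesSketch<TunedKernel1, TunedKernel2>(instances);
}
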
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance
+                    = device_contraction_kk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
new file mode 100644
index 00000000..d82ea442
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance
+                    = device_contraction_kn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
new file mode 100644
index 00000000..772df2e3
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance
+                    = device_contraction_mk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp
new file mode 100644
index 00000000..8b1d0681
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance
+                    = device_contraction_mn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, F16>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
new file mode 100644
index 00000000..ad5ce461
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance
+                    = device_contraction_f64_kk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
new file mode 100644
index 00000000..ae3ee856
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance
+                    = device_contraction_f64_kn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
new file mode 100644
index 00000000..b72005ad
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance
+                    = device_contraction_f64_mk_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp
new file mode 100644
index 00000000..b94030e5
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
+// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance
+                    = device_contraction_f64_mn_instance;
+
+                void
+                add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
index 05400151..1da8301f 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index bba95b14..82c17500 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
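The kkn/knn/mkn/mnn suffixes in the file names above encode which mode of each tensor varies fastest, that is, which one has unit stride: the first letter describes A[m0, m1, k0, k1], the second B[n0, n1, k0, k1], and the last E (the bilinear variants insert a fourth letter for D). A minimal sketch of how a dispatcher might derive that suffix from innermost strides follows; the helper and its parameters are hypothetical, not hipTensor API:

#include <cstddef>
#include <string>

// Hypothetical helper: derive the three-letter scale-instance suffix from
// the innermost strides of A and B. A is "k"-fast when its last k mode is
// contiguous, otherwise the "m"-fast family applies; B is classified
// against k/n the same way.
inline std::string instanceSuffix(std::size_t aInnerStride, std::size_t bInnerStride)
{
    std::string s;
    s += (aInnerStride == 1) ? 'k' : 'm'; // A[m0, m1, k0, k1]
    s += (bInnerStride == 1) ? 'k' : 'n'; // B[n0, n1, k0, k1]
    s += 'n';                             // every instance in this commit keeps E n-fast
    return s;                             // "kkn", "knn", "mkn" or "mnn"
}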
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index fb5ecec0..1febb560 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index 1dd6613c..02b9d719 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index e98aee20..5917e466 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
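Every scale instance touched by this commit computes the rank-4 contraction written in the recurring comment: E[m0, m1, n0, n1] = alpha * sum over (k0, k1) of A[m0, m1, k0, k1] * B[n0, n1, k0, k1]; the bilinear variants additionally add beta * D[m0, m1, n0, n1]. As a plain-CPU reference for what the scale form evaluates, here is a sketch assuming packed row-major buffers (the function name and layout are illustrative only):

#include <cstddef>
#include <vector>

// Naive reference: each output element accumulates over both contracted
// modes (k0, k1) and is then scaled by alpha.
void referenceScaleContraction(const std::vector<float>& A, // [M0][M1][K0][K1]
                               const std::vector<float>& B, // [N0][N1][K0][K1]
                               std::vector<float>&       E, // [M0][M1][N0][N1]
                               std::size_t M0, std::size_t M1,
                               std::size_t N0, std::size_t N1,
                               std::size_t K0, std::size_t K1,
                               float alpha)
{
    for(std::size_t m0 = 0; m0 < M0; ++m0)
        for(std::size_t m1 = 0; m1 < M1; ++m1)
            for(std::size_t n0 = 0; n0 < N0; ++n0)
                for(std::size_t n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.0f;
                    for(std::size_t k0 = 0; k0 < K0; ++k0)
                        for(std::size_t k1 = 0; k1 < K1; ++k1)
                            acc += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1]
                                   * B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
                    E[((m0 * M1 + m1) * N0 + n0) * N1 + n1] = alpha * acc;
                }
}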
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index db8de1c0..216f470e 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index 397ef327..3401b605 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index 1f9221dc..fe2fa97d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
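Each add_device_contraction_* entry point in these files follows the same registration pattern: add_device_operation_instances walks a compile-time list of concrete kernel configurations, default-constructs one object per configuration, and appends it to the caller's vector behind the DeviceContractionMultipleD interface. A simplified stand-in for that pattern, a sketch rather than CK's actual implementation:

#include <memory>
#include <tuple>
#include <vector>

// Simplified registration helper: default-construct every instance type in
// a compile-time tuple and hand each one back as a base-class pointer.
template <typename BaseOp, typename... Instances>
void addDeviceOperationInstances(std::vector<std::unique_ptr<BaseOp>>& ops,
                                 std::tuple<Instances...> /*instanceList*/)
{
    (ops.push_back(std::make_unique<Instances>()), ...);
}

A dispatcher can then walk the filled vector and pick the first candidate whose IsSupportedArgument check accepts the problem, so the per-layout instance files themselves carry no selection logic.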
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp new file mode 100644 index 00000000..e4e4b7de --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp new file mode 100644 index 00000000..a641f6e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
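+//
+// Each add_*_instance() function in these files appends concrete kernels to
+// a caller-owned vector, so several layouts can be pooled before filtering.
+// A minimal usage sketch, where DeviceOpPtr stands in for the vector's
+// unique_ptr element type (not a real symbol):
+//
+//     std::vector<DeviceOpPtr> ops;
+//     add_device_contraction_scale_..._kkn_instance(ops);
+//     add_device_contraction_scale_..._mnn_instance(ops);
+//     // then keep the first op whose IsSupportedArgument(...) passes
+//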
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..04176d80 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
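+//
+// These instances keep A/B/D/E in F64 but accumulate through an F32 compute
+// type, trading some precision for speed. Through the hipTensor API the
+// combination is selected by pairing HIP_R_64F tensor descriptors with
+//
+//     constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
+//
+// which is what the simple_*_contraction_f64_f32 samples added in this
+// series do.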
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..06481fc7 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
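+//
+// On the kernel name: "xdl" marks kernels built on the XDLOPS/MFMA matrix
+// instructions of CDNA GPUs, and "c_shuffle" marks an epilogue that stages
+// the accumulator tile through LDS so global writes land coalesced. This
+// reading of composable_kernel's naming scheme is offered as background,
+// not as a normative definition.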
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..94922008 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
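+//
+// Scale contractions compute E = alpha * contract(A, B) with no C/D input
+// tensor, which is why registry queries for the scale path pass
+// hiptensor::NONE_TYPE in the D slot. A sketch in the query style used
+// elsewhere in this series:
+//
+//     solnQ.query(HIP_R_64F, HIP_R_64F,
+//                 hipDataType(hiptensor::NONE_TYPE), HIP_R_64F);
+//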
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..e70b854b --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
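+//
+// add_device_operation_instances is assumed to expand the brace-initialized
+// instance list and move each concrete op into the caller's vector. A
+// hypothetical stand-in for what such a helper does:
+//
+//     template <typename Vec, typename... Ops>
+//     void append_ops(Vec& v, Ops... ops) // illustrative, not the CK API
+//     {
+//         (v.push_back(std::make_unique<Ops>(ops)), ...);
+//     }
+//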
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index b96a204e..c7b7501b 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -242,17 +242,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. 
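     // Query composition note: each ::query(...) call narrows the candidate
     // set, and partial queries can be OR-ed together, as the F32-only
     // fallback removed above did:
     //
     //     solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F)
     //          || solnQ.query(HIP_R_32F, HIP_R_32F,
     //                         hipDataType(hiptensor::NONE_TYPE), HIP_R_32F);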
if(solnQ.solutionCount() == 0) @@ -461,15 +450,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +490,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +509,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +574,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast(alpha))); - } + auto alphaValue + = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); } if(beta == nullptr) @@ -602,15 +585,8 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast(beta))); - } + auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); } } else @@ -745,6 +721,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { auto time = (*cSolution)(StreamConfig{stream, true}); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -773,7 +753,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..38e9f186 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -132,6 +132,49 @@ namespace hiptensor } } + void 
writeVal(void const* addr, hiptensorComputeType_t id, double value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..19ccca6c 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -65,6 +65,8 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, double value); + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index 15972d60..de834d72 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -29,9 +29,17 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) # If building hipTensor samples as a standalone Cmake project else() @@ -44,6 +52,18 @@ else() add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + + 
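+    # Note: every sample is registered twice by design: through
+    # add_hiptensor_sample() for in-tree builds (the branch above), and as an
+    # add_executable() + target_link_libraries(... hiptensor::hiptensor) pair
+    # for standalone builds here.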
add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) @@ -53,4 +73,15 @@ else() add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp new file mode 100644 index 00000000..aaef4a1b --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -0,0 +1,351 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +template +int bilinearContractionSample() +{ + floatTypeCompute alpha = (floatTypeCompute)1.0f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
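+    // Two notes on this shared sample header:
+    //  * Element counts below are products of the mode extents; for A that
+    //    is extent['m'] * extent['n'] * extent['h'] * extent['k']
+    //    = 4 * 3 * 6 * 5 = 360 elements.
+    //  * The header is driven by thin per-precision .cpp files; judging by
+    //    the typedefs and constexpr values those files define, each is
+    //    expected to instantiate
+    //
+    //        bilinearContractionSample<ADataType, BDataType, CDataType,
+    //                                  floatTypeCompute, typeA, typeB,
+    //                                  typeC, typeCompute>();
+    //
+    //    (template parameter list inferred from the drivers, stated here as
+    //    an assumption).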
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + int initMethod = 0; // TODO read value from commandline + for(int64_t i = 0; i < elementsA; i++) + { + if(initMethod == 0) + { + A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsB; i++) + { + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsC; i++) + { + if(initMethod == 0) + { + C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + C[i] = (BDataType)(float(i) / 100); + } + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." 
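+    // hiptensorGetAlignmentRequirement (called below) reports the byte
+    // alignment that each device pointer satisfies for its descriptor; the
+    // values are then handed to hiptensorInitContractionDescriptor so kernel
+    // selection can rely on them. This reading of the call's purpose is
+    // inferred from its usage here, not from the API reference.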
<< std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." 
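+    // Argument mapping for the bilinear call below: alpha scales the A*B
+    // contraction, beta scales C, and C_d fills both the C and D slots so
+    // the update happens in place:
+    //
+    //     C_d = alpha * contract(A_d, B_d) + beta * C_d
+    //
+    // The trailing 0 selects the default HIP stream.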
<< std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp index 0a4a9314..f6714a2f 100644 --- a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeC = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeC = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp index d9d044c9..40708c77 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef _Float16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeC = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32.cpp index 5704a59d..ee046145 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef float CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp new file mode 100644 index 00000000..42f60ecb --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp new file mode 100644 index 00000000..d39a4fca --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64.cpp new file mode 100644 index 00000000..412ebbc5 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp new file mode 100644 index 00000000..673c4768 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp new file mode 100644 index 00000000..e9d482c3 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -0,0 +1,341 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +template +int scaleContractionSample() +{ + floatTypeCompute alpha = (floatTypeCompute)1.0f; + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + int initMethod = 0; // TODO read the value from command line + for(int64_t i = 0; i < elementsA; i++) + { + if(initMethod == 0) + { + A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsB; i++) + { + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." 
<< std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." 
<< std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp index e05916bf..7b0f8b6c 100644 --- a/samples/01_contraction/simple_scale_contraction_bf16.cpp +++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp @@ -23,16 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -44,291 +35,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeD = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeD = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp index 1e62be85..d69193f0 100644 --- a/samples/01_contraction/simple_scale_contraction_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -23,312 +23,35 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { /*************************************** * Check device support * **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + typedef _Float16 ADataType; typedef _Float16 BDataType; typedef _Float16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeD = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32.cpp index c76ec370..e53cc468 100644 --- a/samples/01_contraction/simple_scale_contraction_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32.cpp @@ -23,16 +23,8 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -50,291 +42,17 @@ int main(int argc, char* argv[]) typedef float DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeD = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp new file mode 100644 index 00000000..c11b8ded --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp new file mode 100644 index 00000000..377ee707 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64.cpp new file mode 100644 index 00000000..5eb94c15 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp new file mode 100644 index 00000000..fdec48ab --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index a08065a0..08ddf0b2 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -3,8 +3,11 @@ Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index b28e9a88..08ddf0b2 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -4,7 +4,10 @@ Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, 
NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 9446157f..ce67278f 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -125,6 +125,9 @@ namespace hiptensor || (CDataType == NONE_TYPE)); EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE( + (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) + || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)); mRunFlag &= checkDevice(DDataType); @@ -488,7 +491,11 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - auto computeType = convertToComputeType(testType[4]); + auto computeType = convertToComputeType(testType[4]); + double alphaBuf = 0.; + double betaBuf = 0.; + writeVal(&alphaBuf, computeType, alpha); + writeVal(&betaBuf, computeType, beta); CHECK_HIPTENSOR_ERROR( hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); @@ -497,20 +504,21 @@ namespace hiptensor CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan, - (void*)&alpha, + (void*)&alphaBuf, resource->deviceA().get(), resource->deviceB().get(), - (void*)&beta, + (void*)&betaBuf, resource->deviceC().get(), resource->deviceD().get(), workspace, worksize, 0 /* stream */)); - CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha, + CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan, + (void*)&alphaBuf, resource->hostA().get(), resource->hostB().get(), - (void*)&beta, + (void*)&betaBuf, resource->hostC().get(), resource->hostD().get(), a_ms_ks.mLengths, From ab8d557e0e68d29c5d3b17020c5c43ef898ede8f Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 30 Nov 2023 18:46:46 +0000 Subject: [PATCH 10/42] Add placeholder for solution unique_id Solution unique_ids for Actor Critic are not ready yet, so we put placeholders in the new Actor Critic path to let the unit tests pass.
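To make the placeholder pattern concrete for reviewers, here is a minimal, self-contained C++ sketch of what the selection code in this patch reduces to; the type names (Solution, CandidateMap) and the registered entry are illustrative assumptions rather than hiptensor's actual types, and only the hard-coded unique_id value is taken from the patch itself:

    #include <cstddef>
    #include <iostream>
    #include <unordered_map>

    // Stand-in for a generated contraction kernel; illustrative only.
    struct Solution
    {
        char const* name;
    };

    // Registry of candidate kernels keyed by a stable 64-bit unique_id.
    using CandidateMap = std::unordered_map<std::size_t, Solution>;

    // Placeholder selection: look up a hard-coded id until the Actor Critic
    // model can predict the best kernel for a given problem shape.
    Solution const* selectSolution(CandidateMap const& candidates)
    {
        std::size_t unique_id = 7255639152084218514ull;
        if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
        {
            return &candidate->second;
        }
        return nullptr; // No kernel registered under this id.
    }

    int main()
    {
        CandidateMap candidates{{7255639152084218514ull, Solution{"kknn_f32_instance"}}};
        if(auto* solution = selectSolution(candidates))
        {
            std::cout << "selected: " << solution->name << std::endl;
        }
        return 0;
    }

Once Actor Critic selection is ready, the hard-coded assignment is the single line that a prediction replaces.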
--- .../src/contraction/contraction_selection.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 888ef4c1..68c748b0 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -72,8 +72,8 @@ namespace hiptensor * hipDataTypeSize(typeE); void * A_d, *B_d, *D_d, *E_d, *wspace; - double alpha = 0.0d; - double beta = 0.0d; + double alpha = 0.0; + double beta = 0.0; writeVal(&alpha, computeType, 1.02); writeVal(&beta, computeType, 1.03); @@ -188,7 +188,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -237,7 +237,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -286,7 +286,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -335,7 +335,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -379,6 +379,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 1078559130597702989ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -421,6 +422,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 6506383527825239632ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -463,6 +465,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 14486135440731032454ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -510,6 +513,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11931735240548010466ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1276,6 +1280,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11912251726020349830ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1317,6 +1322,7 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; + unique_id = 15375432626310194825ull; // TODO select unique_id if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) From df27e326d15a65118a657b04c63eef37ecde946e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:12:44 +0000 Subject: [PATCH 11/42] Update contraction device instances Update contraction device instances since CK has updated them. 
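For context on the churn in this patch: each CK "device instance" is one fully specialized kernel template, and the touched files collect those specializations into a tuple that hipTensor's selection layer enumerates. A trimmed-down sketch of that shape, assuming a three-parameter stand-in (the real DeviceContractionMultipleD_Xdl_CShuffle template takes dozens of parameters, which is why a CK signature change forces regenerating every instance file here):

    #include <iostream>
    #include <tuple>

    // Illustrative stand-in for a CK device op: one type per tile configuration.
    template <int BlockSize, int MPerBlock, int NPerBlock>
    struct DeviceContractionInstance
    {
        static constexpr int block_size  = BlockSize;
        static constexpr int m_per_block = MPerBlock;
        static constexpr int n_per_block = NPerBlock;
    };

    // The instance list; the tile sizes mirror the 256/128 configurations
    // visible in the real instance files in this patch.
    using device_instances = std::tuple<DeviceContractionInstance<256, 256, 128>,
                                        DeviceContractionInstance<256, 128, 256>,
                                        DeviceContractionInstance<128, 128, 128>>;

    int main()
    {
        std::cout << std::tuple_size<device_instances>::value << " candidate instances"
                  << std::endl;
        return 0;
    }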
--- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 62 ++++++----------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 65 ++++++------------ ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 59 ++++++---------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 65 +++++++----------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 68 +++++++------------ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 27 +++++++- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 58 ++++++---------- 20 files changed, 460 insertions(+), 642 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 88345e74..24d2d570 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp 
b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 38702afd..f559dc06 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 
128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 735a5e34..a522052d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index 04176d80..dac46620 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
index 06481fc7..0830b49f 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
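Every instance file touched in this series follows the same pattern: the hand-written std::tuple<DeviceContractionMultipleD_Xdl_CShuffle<...>, ...> tuning tables are collapsed into shared aliases (device_contraction_kk_instance, device_contraction_f64_mk_instance, and so on), presumably provided by the newly included ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp, while each add_device_contraction_* signature gains a trailing compute data type (F32 or F64) on DeviceContractionMultipleD. The following is a minimal, self-contained sketch of that alias pattern; the type names and parameter order are illustrative stand-ins, not CK's actual templates.

#include <tuple>

// Stand-ins for CK's element-wise functors (hypothetical, for illustration only).
struct Bilinear { };
struct Scale { };

// Stand-in for a fully specialized device op; the real
// DeviceContractionMultipleD_Xdl_CShuffle takes dozens of tuning parameters.
template <int NumDimM, int NumDimN, int NumDimK,
          typename ABDataType, typename DsTuple, typename EDataType,
          typename CDEOp, typename ComputeType /* the new trailing parameter */>
struct DeviceOpInstance { };

// One parameterized alias replaces the per-file tuples: every layout variant
// (kkn, knn, mkn, mnn) draws its rows from a common template instead of
// spelling out each instance row by hand in its own .cpp file.
template <typename DsTuple, typename CDEOp, typename ComputeType>
using device_contraction_f64_instance = std::tuple<
    DeviceOpInstance<2, 2, 2, double, DsTuple, double, CDEOp, ComputeType>
    /* , ...further block/tile tuning variants... */>;

// Bilinear consumes a one-element Ds tuple (the D tensor); Scale consumes none.
using bilinear_f64 = device_contraction_f64_instance<std::tuple<double>, Bilinear, double>;
using scale_f64    = device_contraction_f64_instance<std::tuple<>, Scale, double>;

int main()
{
    bilinear_f64 b;
    scale_f64    s;
    (void)b;
    (void)s;
    return 0;
}

The payoff, visible in each hunk above, is that a layout file shrinks to the fast-changing-dimension comment plus a single alias line, with the tuning rows maintained once in the shared header.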
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
index 94922008..9a716ba3 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
index e70b854b..e02ac144 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
@@ -1,5 +1,28 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
 
 // This (ifndef) is a hack to use customized behavior for buffer load rather than using default
 // setting Don't use this hack unless absolutely necessary!
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index f8904a8f..6f168ee2 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter
-// of each device op
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, 
F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, From f85df837f3ae885178b197f8c2435c14e9847a2c Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:29:00 +0000 Subject: [PATCH 12/42] Print C in sample output 1. Initialize the data with 0.01, 0.02, ... by default 2.
Print C --- .../simple_bilinear_contraction.hpp | 22 ++++++++++++++----- .../simple_scale_contraction.hpp | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index aaef4a1b..27001232 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -154,7 +154,7 @@ int bilinearContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read value from commandline + int initMethod = 1; // TODO read value from commandline for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) @@ -287,11 +287,6 @@ int bilinearContractionSample() bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -314,6 +309,15 @@ int bilinearContractionSample() hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -327,6 +331,12 @@ int bilinearContractionSample() hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp index e9d482c3..78b026b6 100644 --- a/samples/01_contraction/simple_scale_contraction.hpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -151,7 +151,7 @@ int scaleContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read the value from command line + int initMethod = 1; // TODO read the value from command line for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) From 5c45a8c80dd0e90171a791bd945c0e41b84ef22d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 18:15:54 +0000 Subject: [PATCH 13/42] Set CK contraction instance to run only once When the logger level is set to HIPTENSOR_LOG_LEVEL_PERF_TRACE, we make CK instances measure the running time. The problem is that CK internally will run the contraction 10 times by default. This leads to an issue: 1.
It returns a wrong result for C = alpha * A x B + beta * C. Setting StreamConfig.nrepeat_ = 1, the contraction will be run only once --- library/src/contraction/hiptensor_contraction.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index c7b7501b..8148eeaa 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -720,7 +720,13 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); if(time < 0) { return HIPTENSOR_STATUS_CK_ERROR; From f631818937db143e42d444d4a0c2ce5646ad525e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 23:57:33 +0000 Subject: [PATCH 14/42] Fixed a bug in CPU reference 1. ck::bhalf_t cannot be cast to float or double by static_cast. Use ck::type_convert() to fix it. 2. epsilon() is not a good value to measure the relative difference of data. It is too small for double (eps < 1e-12). --- .../contraction_cpu_reference_impl.hpp | 17 +++++++---------- .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 ++++----- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index ac4fc20d..a9a9d176 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -53,7 +53,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -152,7 +151,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - auto accum = static_cast(0); + float accum = 0.0f; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -174,8 +173,7 @@ namespace hiptensor arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum - += static_cast(valA) * static_cast(valB); + accum += ck::type_convert(valA) * ck::type_convert(valB); } } @@ -184,15 +182,17 @@ namespace hiptensor if constexpr(std::is_same_v) { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } else // bilinear { // NumDTensor will be 1 due to SFINAE of this class auto indexD = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE( - ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); } }; @@ -323,7 +323,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -335,7 +334,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - AccumDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, @@ -375,7 +373,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - EDataType,
AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..05daf544 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits::signaling_NaN(); } - else if(max_relative_error > (eps * tolerance)) + else if(max_relative_error > tolerance) { retval = false; } @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,13 +276,12 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; - auto eps = toDouble(std::numeric_limits::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits::signaling_NaN(); } - else if(maxRelativeError > tolerance) From e5cefe79a7e4630b4e1f07edd425a6cba6fda519 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 01:43:48 +0000 Subject: [PATCH 15/42] Add comments --- library/src/contraction/contraction_meta_traits.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 6a7cb35f..e66ac432 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -67,6 +67,14 @@ namespace hiptensor constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; + /* + * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) for the cuda bhalf_t type. + * What we want here is that we can use ck::bhalf_t with ck instances and use hip_bfloat16 + * with hiptensor classes. + * + * When creating a solution, ck::bhalf_t is passed in to create the ck instance. + * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
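+ * + * For example (illustrative only, not exhaustive): a bf16 solution is created with ADataType = ck::bhalf_t for the CK instance, while MetaTraits reports ADataT = hip_bfloat16, so hiptensor-side lookups and the public API both see hip_bfloat16.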
+ */ using ADataT = std::conditional_t, hip_bfloat16, ADataType>; using BDataT From 4345a1c5b4b32fa427a8880a944895b3947ee6dd Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 17:14:53 +0000 Subject: [PATCH 16/42] Rename contraction sample files The naming pattern of contraction sample files is - bilinear: simple_bilinear_contraction_<typeA>_<typeB>_<typeC>_<typeD>_compute_<typeCompute>.cpp - scale : simple_scale_contraction_<typeA>_<typeB>_<typeD>_compute_<typeCompute>.cpp --- samples/01_contraction/CMakeLists.txt | 85 ++++++++++--------- ...tion_bf16_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...ntraction_f16_f16_f16_f16_compute_f16.cpp} | 0 ...traction_f32_f32_f32_f32_compute_bf16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f64.cpp} | 0 ...ntraction_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...e_contraction_f16_f16_f16_compute_f16.cpp} | 0 ..._contraction_f32_f32_f32_compute_bf16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f64.cpp} | 0 15 files changed, 43 insertions(+), 42 deletions(-) rename samples/01_contraction/{simple_bilinear_contraction_bf16.cpp => simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f16.cpp => simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_bf16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_f16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64_f32.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_bf16.cpp => simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f16.cpp => simple_scale_contraction_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_bf16.cpp => simple_scale_contraction_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_f16.cpp => simple_scale_contraction_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32.cpp => simple_scale_contraction_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64_f32.cpp => simple_scale_contraction_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64.cpp => simple_scale_contraction_f64_f64_f64_compute_f64.cpp} (100%) diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index de834d72..00393f1d 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,62 +26,63 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) -
add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() - add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) - target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + 
target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE 
hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to 
samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp From 43f33ee5c6b40d0b4278cd1c221399eb99b16a7d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:02:53 +0000 Subject: [PATCH 17/42] Improve CPU reference accuracy The relative difference between the contraction result and the CPU reference is less than 0.1% after the improvement.
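For illustration only (a sketch, not part of this patch): the idea is to round each partial product through the compute type before accumulating, mirroring what the device instances do. Assuming float data with a hypothetical _Float16 compute type:

```
#include <cstddef>

// Sketch: accumulate in float, but round each product through the
// narrower compute type first, as the accuracy fix below does with
// ck::type_convert in the CPU reference.
float referenceDot(const float* a, const float* b, std::size_t n)
{
    float accum = 0.0f;
    for(std::size_t i = 0; i < n; ++i)
    {
        _Float16 prod = static_cast<_Float16>(a[i]) * static_cast<_Float16>(b[i]);
        accum += static_cast<float>(prod);
    }
    return accum;
}
```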
--- library/src/contraction/contraction_cpu_reference_impl.hpp | 3 ++- test/utils.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index a9a9d176..d21df2d3 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -173,7 +173,8 @@ namespace hiptensor arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum += ck::type_convert(valA) * ck::type_convert(valB); + accum += ck::type_convert(ck::type_convert( + ck::type_convert(valA) * ck::type_convert(valB))); } } diff --git a/test/utils.hpp b/test/utils.hpp index 05daf544..f39f0fb5 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { bool retval = true; double max_relative_error = 0.0; @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); From fec9065460d2205f9b9478ccd5f69fa51d2a839e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:19:36 +0000 Subject: [PATCH 18/42] Add comments to explain how to pass the alpha value --- library/src/contraction/contraction_selection.cpp | 11 ++++++++++- test/01_contraction/contraction_test.cpp | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 68c748b0..9b0cdf9f 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -71,7 +71,16 @@ namespace hiptensor auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides) * hipDataTypeSize(typeE); - void * A_d, *B_d, *D_d, *E_d, *wspace; + void *A_d, *B_d, *D_d, *E_d, *wspace; + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a ComputeData value + */ double alpha = 0.0; double beta = 0.0; writeVal(&alpha, computeType, 1.02); diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index ce67278f..76cc3033 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -491,9 +491,18 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - auto computeType = convertToComputeType(testType[4]); - double alphaBuf = 0.; - double betaBuf = 0.; + auto computeType = convertToComputeType(testType[4]); + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha.
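+ * For example: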
+ * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a ComputeData value + */ + double alphaBuf = 0.; + double betaBuf = 0.; writeVal(&alphaBuf, computeType, alpha); writeVal(&betaBuf, computeType, beta); From b21fe0b18881fb6ed5643be7bd2e242f9a4b45a2 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 7 Dec 2023 02:16:22 +0000 Subject: [PATCH 19/42] Update CPU reference 1. Revert the default threshold of the relative difference to (100 * std::numeric_limits::epsilon()) 2. Update the CPU reference so that the difference between the CPU reference and the output of the contraction instance is less than (100 * std::numeric_limits::epsilon()). --- .../contraction_cpu_reference_impl.hpp | 29 ++++++++++++++----- .../contraction_cpu_reference_instances.cpp | 14 +++++++++ .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 +++--- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index d21df2d3..2e3d0cbe 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -45,19 +45,25 @@ namespace hiptensor { // hardcoded for NumDimM == NumDimN == NumDimK == 2 + // + // ck::bhalf_t is ushort, which cannot perform bhalf_t * bhalf_t. + // CK does not use ck::bhalf_t as AccDataType. But we still + // add this guard here template < ck::index_t NumDimM, ck::index_t NumDimN, ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputeDataType = ADataType, - ck::enable_if_t, bool> = false> struct ReferenceContraction_M2_N2_K2 @@ -152,7 +157,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - AccDataType accum = 0; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -165,16 +171,19 @@ namespace hiptensor auto indexB = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - ADataType valA; - BDataType valB; + AccDataType valA; + AccDataType valB; // Element-wise ops - arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]); - arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); + arg.mOpA( + valA, + ck::type_convert(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, + ck::type_convert(((BDataType*)arg.mB)[indexB])); // Mult / accum - accum += ck::type_convert(ck::type_convert( + ck::type_convert(valA) * ck::type_convert(valB))); + accum += valA * valB; } } @@ -322,6 +331,7 @@ namespace hiptensor ck::index_t NumDimsK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -333,6 +343,7 @@ namespace hiptensor NumDimsK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, @@ -359,6 +370,7 @@ namespace hiptensor ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -372,6 +384,7 @@ namespace hiptensor NumDimK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 173a49e9..31fb0191 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -39,6 +39,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -53,6 +54,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -67,6 +69,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, @@ -80,6 +83,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, @@ -93,6 +97,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, @@ -107,6 +112,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple, double, ck::tensor_operation::element_wise::PassThrough, @@ -120,6 +126,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple, double, ck::tensor_operation::element_wise::PassThrough, @@ -134,6 +141,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple<>, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -148,6 +156,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple<>, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -162,6 +171,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -175,6 +185,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -188,6 +199,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -202,6 +214,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, @@ -215,6 +228,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... 
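The utils.hpp hunk below restores the epsilon-scaled comparison. As a minimal standalone sketch of that check (hypothetical helper, not part of this patch; names mirror test/utils.hpp, and DDataType must have a std::numeric_limits specialization):

```
#include <cmath>
#include <limits>

// Sketch: an output value passes when its maximum relative error stays
// below 100 * machine epsilon of the output data type.
template <typename DDataType>
bool withinTolerance(double maxRelativeError, double tolerance = 100.0)
{
    double eps = static_cast<double>(std::numeric_limits<DDataType>::epsilon());
    return !std::isnan(maxRelativeError) && (maxRelativeError <= eps * tolerance);
}
```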
diff --git a/test/utils.hpp b/test/utils.hpp index f39f0fb5..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits::signaling_NaN(); } - else if(max_relative_error > tolerance) + else if(max_relative_error > (eps * tolerance)) { retval = false; } @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,12 +276,13 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; + auto eps = toDouble(std::numeric_limits::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits::signaling_NaN(); } - else if(maxRelativeError > tolerance) + else if(maxRelativeError > (eps * tolerance)) { retval = false; } From 76de7d0b89f1961b0c43c6cab8781565d6f9ad08 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 11 Dec 2023 20:09:31 +0000 Subject: [PATCH 20/42] Remove xfloat32 which is not used in hiptensor --- .../hiptensor/internal/native_types.hpp | 5 - .../hiptensor/internal/type_traits.hpp | 82 +---- .../include/hiptensor/internal/xfloat32.hpp | 334 ------------------ 3 files changed, 5 insertions(+), 416 deletions(-) delete mode 100644 library/include/hiptensor/internal/xfloat32.hpp diff --git a/library/include/hiptensor/internal/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp index 6c9dbee8..69ce706f 100644 --- a/library/include/hiptensor/internal/native_types.hpp +++ b/library/include/hiptensor/internal/native_types.hpp @@ -33,8 +33,6 @@ #include #include -#include "xfloat32.hpp" - namespace hiptensor { @@ -84,9 +82,6 @@ namespace hiptensor #if !HIPTENSOR_NO_HALF using hfloat16_t = __half; #endif // !HIPTENSOR_NO_HALF - - using xfloat32_t = hiptensor_xfloat32; - // clang-format off diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index 3867839d..48566051 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -26,9 +26,11 @@ #ifndef HIPTENSOR_TYPE_TRAITS_HPP #define HIPTENSOR_TYPE_TRAITS_HPP -#include "native_types.hpp" #include +#include "config.hpp" +#include "native_types.hpp" + namespace hiptensor { namespace detail @@ -69,9 +71,8 @@ namespace hiptensor { union { - uint32_t i32; - float32_t f32; - xfloat32_t xf32; + uint32_t i32; + float32_t f32; }; constexpr Fp32Bits(uint32_t initVal) : i32(initVal) @@ -81,10 +82,6 @@ namespace hiptensor : f32(initVal) { } - constexpr Fp32Bits(xfloat32_t initVal) - : xf32(initVal) - { - } }; } // namespace detail @@ -273,68 +270,6 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } - - /////////////////////////////////////////////////////////// - /////////// std::numeric_limits ////////////// - /////////////////////////////////////////////////////////// - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::epsilon() noexcept - { - 
hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::infinity() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::lowest() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::max() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::min() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::quiet_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::signaling_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); - return eps.xf32; - } - // @endcond - } // namespace std namespace hiptensor @@ -378,13 +313,6 @@ namespace hiptensor // b16 mantissa is 7 bits return ((int32_t)1 << 8); } - - template ::value, int> = 0> - constexpr auto maxExactInteger() -> int32_t - { - // xf32 mantissa is 7 bits - return ((int32_t)1 << 8); - } } // namespace hiptensor #endif // HIPTENSOR_TYPE_TRAITS_HPP diff --git a/library/include/hiptensor/internal/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp deleted file mode 100644 index 6e9168cf..00000000 --- a/library/include/hiptensor/internal/xfloat32.hpp +++ /dev/null @@ -1,334 +0,0 @@ -/* ************************************************************************ - * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- - * ies of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- - * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- - * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ************************************************************************ */ - -/*!\file - * \brief xfloat32.h provides struct for hiptensor_xfloat32 typedef - */ - -#ifndef HIPTENSOR_XFLOAT32_HPP -#define HIPTENSOR_XFLOAT32_HPP - -#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only -// include a minimal definition of hiptensor_xfloat32 - -#include -typedef struct -{ - float data; -} hiptensor_xfloat32; - -#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#include -#include -#include -#include -#include -#include - -#include "config.hpp" - -struct hiptensor_xfloat32 -{ - float data; - - enum round_t - { - round_up - }; - - HIPTENSOR_HOST_DEVICE hiptensor_xfloat32() = default; - - // round upper 19 bits of IEEE float to convert to xfloat32 - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f, round_t) - : data(float_to_xfloat32(f)) - { - } - - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f) - : data(truncate_float_to_xfloat32(f)) - { - } - - // zero extend lower 13 bits of xfloat32 to convert to IEEE float - HIPTENSOR_HOST_DEVICE operator float() const - { - return data; - } - - explicit HIPTENSOR_HOST_DEVICE operator bool() const - { - union - { - float fp32; - uint32_t int32; - } u = {data}; - return u.int32 & 0x7fffe000; - } - - explicit HIPTENSOR_HOST_DEVICE operator uint32_t() const - { - return uint32_t(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator long() const - { - return long(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator double() const - { - return double(float(*this)); - } - -private: - static HIPTENSOR_HOST_DEVICE float float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - if(~u.int32 & 0x7f800000) - { - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the xfloat32 mantissa up by adding 0xFFF, plus - // 1 if the least significant bit of the xfloat32 mantissa is 1 (odd). - // This causes the xfloat32's mantissa to be incremented by 1 if the 13 - // least significant bits of the float mantissa are greater than 0x1000, - // or if they are equal to 0x1000 and the least significant bit of the - // xfloat32 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 13 bits are exactly 0x1000. If the xfloat32 mantissa already - // has the value 0x3ff, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded xfloat32 value. When the xfloat32 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x3FF, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the xfloat32 value has an exponent of 0xFE and a mantissa of 0x3FF, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - - u.int32 += 0xfff + ((u.int32 >> 13) & 1); // Round to nearest, round to even - } - else if(u.int32 & 0x1fff) - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. 
If any of the - // lower 13 bits of the mantissa are 1, we set the least significant bit - // of the xfloat32 mantissa, in order to preserve signaling NaN in case - // the xfloat32's mantissa bits are all 0. - u.int32 |= 0x2000; // Preserve signaling NaN - } - - u.int32 &= 0xffffe000; - return u.fp32; - } - - // Truncate instead of rounding - static HIPTENSOR_HOST_DEVICE float truncate_float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - - u.int32 = u.int32 & 0xffffe000; - return u.fp32; - } -}; - -typedef struct -{ - float data; -} hiptensor_xfloat32_public; - -static_assert(std::is_standard_layout{}, - "hiptensor_xfloat32 is not a standard layout type, and thus is " - "incompatible with C."); - -static_assert(std::is_trivial{}, - "hiptensor_xfloat32 is not a trivial type, and thus is " - "incompatible with C."); - -static_assert(sizeof(hiptensor_xfloat32) == sizeof(hiptensor_xfloat32_public) - && offsetof(hiptensor_xfloat32, data) - == offsetof(hiptensor_xfloat32_public, data), - "internal hiptensor_xfloat32 does not match public hiptensor_xfloat32"); - -inline std::ostream& operator<<(std::ostream& os, const hiptensor_xfloat32& xf32) -{ - return os << float(xf32); -} - -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a) -{ - return a; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a) -{ - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - u.int32 ^= 0x80000000; - return hiptensor_xfloat32(u.fp32); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) + float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) - float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator*(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) * float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator/(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) / float(b)); -} -inline HIPTENSOR_HOST_DEVICE bool operator<(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) < float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator==(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) == float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return b < a; -} -inline HIPTENSOR_HOST_DEVICE bool operator<=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a > b); -} -inline HIPTENSOR_HOST_DEVICE bool operator!=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a == b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a < b); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator+=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a + b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator-=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a - b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator*=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a * b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator/=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a / b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator++(hiptensor_xfloat32& a) -{ - return a += hiptensor_xfloat32(1.0f); 
-} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator--(hiptensor_xfloat32& a) -{ - return a -= hiptensor_xfloat32(1.0f); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator++(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - ++a; - return orig; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator--(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - --a; - return orig; -} - -namespace std -{ - constexpr HIPTENSOR_HOST_DEVICE bool isinf(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && !(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool isnan(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && +(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool iszero(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return (u.fp32 == 0.0f); - } - - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 sin(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(sinf(float(a))); - } - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 cos(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(cosf(float(a))); - } - - HIPTENSOR_HOST_DEVICE constexpr hiptensor_xfloat32 real(const hiptensor_xfloat32& a) - { - return a; - } -} - -#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#endif // HIPTENSOR_XFLOAT32_HPP From 28fe756bb3858eb817540048edbe002c1c43c8f6 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Tue, 12 Dec 2023 12:44:25 +0100 Subject: [PATCH 21/42] fix build warnings removed double std namespaces fix underlines --- .gitignore | 1 + docs/Contributors_Guide.rst | 2 +- docs/Programmers_Guide.rst | 24 +++---- .../internal/hiptensor-version.hpp.in | 9 +++ .../hiptensor/internal/type_traits.hpp | 63 +++++++++++++++++++ .../contraction/contraction_solution_impl.hpp | 6 +- .../contraction_solution_params_impl.hpp | 4 +- 7 files changed, 91 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index ad44a303..674c60bc 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,4 @@ _templates/ _toc.yml docBin/ _doxygen/ +.venv diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst index d75a884b..212248be 100644 --- a/docs/Contributors_Guide.rst +++ b/docs/Contributors_Guide.rst @@ -30,7 +30,7 @@ The hipTensor repository follows a workflow which dictates a /master branch wher the compute bound limit or memory bound limit. Style Guide -========== +=========== This project follows the `CPP Core guidelines `__, diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst index 1eaf9adf..047c1f5a 100644 --- a/docs/Programmers_Guide.rst +++ b/docs/Programmers_Guide.rst @@ -17,13 +17,13 @@ The `library` directory ^^^^^^^^^^^^^^^^^^^^^^^ `library/include/hiptensor/` -''''''''''''''''''''''''''' +'''''''''''''''''''''''''''' Contains C++ include files for the hipTensor API. These files also contain Doxygen comments that document the API. `library/include/hiptensor/internal` -'''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''' Internal include files for: @@ -31,30 +31,30 @@ Internal include files for: - Generate Tensor Utility `library/src/` -'''''''''''' +'''''''''''''' Contains logger, device and performance functions. 
`library/src/contraction/` -'''''''''''''''''''''''' +'''''''''''''''''''''''''' Contains hipTensor core composable kernel header functions and contraction initialization functions. `library/src/contraction/device` -'''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''' Contains hipTensor Bilinear and Scale instance functions The `samples` directory ^^^^^^^^^^^^^^^^^^^^^^^ `01_contraction/simple_bilinear_contraction_f32.cpp` -'''''''''''''''''''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''''''''''''''''''' sample code for calling bilinear contraction for :code:`fp32` input, output and compute types `01_contraction/simple_scale_contraction_f32.cpp` -''''''''''''''''''''''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''''''''''' sample code for calling scale contraction for :code:`fp32` input, output and compute types @@ -62,27 +62,27 @@ The `test` directory ^^^^^^^^^^^^^^^^^^^^^^^ `00_unit/logger` -'''''''''''''' +'''''''''''''''' Test code for testing logger API Functions of hipTensor `01_contraction/bilinear_contraction_f32` -''''''''''''''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F32 types. `01_contraction/bilinear_contraction_f64` -''''''''''''''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F64 types. `01_contraction/scale_contraction_f32` -'''''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F32 types. `01_contraction/scale_contraction_f64` -'''''''''''''''''''''''''''''''''''' +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F64 types. diff --git a/library/include/hiptensor/internal/hiptensor-version.hpp.in b/library/include/hiptensor/internal/hiptensor-version.hpp.in index e1942a2b..89247375 100644 --- a/library/include/hiptensor/internal/hiptensor-version.hpp.in +++ b/library/include/hiptensor/internal/hiptensor-version.hpp.in @@ -38,6 +38,15 @@ #define HIPTENSOR_PATCH_VERSION @hiptensor_VERSION_PATCH@ // clang-format on +/** + * \brief Returns the version number of hipTensor + * + * \details Return the version with three least significant digits for patch version, + * the next three digits for minor version, and the most significant digits for major version. + * + * \returns The version number. 
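+ *
+ * For example, version 1.2.3 is encoded as 1 * 1e6 + 2 * 1e3 + 3 = 1002003.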
+ */ + inline size_t hiptensorGetVersion() { return HIPTENSOR_MAJOR_VERSION * 1e6 + HIPTENSOR_MINOR_VERSION * 1e3 + HIPTENSOR_PATCH_VERSION; diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index 48566051..7735a5c4 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -93,6 +93,7 @@ namespace std /////////// std::numeric_limits ////////////// /////////////////////////////////////////////////////////// +#ifndef DOXYGEN_SHOULD_SKIP_THIS template <> HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t numeric_limits::epsilon() noexcept @@ -270,6 +271,68 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } + + /////////////////////////////////////////////////////////// + /////////// std::numeric_limits ////////////// + /////////////////////////////////////////////////////////// + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::epsilon() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::infinity() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::lowest() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::max() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::min() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::quiet_NaN() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); + return eps.xf32; + } + + template <> + HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t + numeric_limits::signaling_NaN() noexcept + { + hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); + return eps.xf32; + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + // @endcond } // namespace std namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 3b672fbb..263937c3 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -35,11 +35,11 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolution const& s) const noexcept + size_t operator()(hiptensor::ContractionSolution const& s) const noexcept { - return std::hash{}(*s.params()); + return hash{}(*s.params()); } }; } diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index b84f9c2b..3abcaede 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -35,9 +35,9 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept + size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept 
{ return hiptensor::Hash{}(s.dimsM(), s.dimsN(), From c18335a1e81d0829f873d89c1a9b03544aed3c22 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Tue, 12 Dec 2023 16:44:55 +0100 Subject: [PATCH 22/42] update doxyfile --- docs/.doxygen/Doxyfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/.doxygen/Doxyfile b/docs/.doxygen/Doxyfile index 59a973b7..136d3b8c 100644 --- a/docs/.doxygen/Doxyfile +++ b/docs/.doxygen/Doxyfile @@ -2074,7 +2074,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = __device__ +PREDEFINED = __device__ \ + DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The From 7ac3fb965aed3862488b0498d6a0f8a0c33e2eb4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Dec 2023 08:58:43 -0700 Subject: [PATCH 23/42] Bump cryptography from 41.0.4 to 41.0.6 in /docs/.sphinx (#162) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.4 to 41.0.6. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.4...41.0.6) --- updated-dependencies: - dependency-name: cryptography dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index b339d3e7..ce13fde5 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==41.0.4 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub From b03e4f3bdfd595f5341d9da8226d9ac0b8e64b1c Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Wed, 13 Dec 2023 09:57:07 +0100 Subject: [PATCH 24/42] remove type traits, that sneaked back during rebase --- .../hiptensor/internal/type_traits.hpp | 60 ------------------- 1 file changed, 60 deletions(-) diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index 7735a5c4..d1329498 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -271,66 +271,6 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } - - /////////////////////////////////////////////////////////// - /////////// std::numeric_limits ////////////// - /////////////////////////////////////////////////////////// - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::epsilon() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::infinity() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::lowest() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - 
numeric_limits::max() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::min() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::quiet_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::signaling_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); - return eps.xf32; - } #endif // DOXYGEN_SHOULD_SKIP_THIS // @endcond } // namespace std From ded69b930ac83a2126061f71d7dcda6ae2c6d6a7 Mon Sep 17 00:00:00 2001 From: Bence Parajdi Date: Thu, 14 Dec 2023 09:30:28 +0100 Subject: [PATCH 25/42] remove unnecessary endcond --- library/include/hiptensor/internal/type_traits.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/library/include/hiptensor/internal/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp index d1329498..81bafacd 100644 --- a/library/include/hiptensor/internal/type_traits.hpp +++ b/library/include/hiptensor/internal/type_traits.hpp @@ -272,7 +272,6 @@ namespace std return eps.b16; } #endif // DOXYGEN_SHOULD_SKIP_THIS - // @endcond } // namespace std namespace hiptensor From 0b34d47ca7ba6848d71867142c76cf27b2c88d27 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Dec 2023 10:39:01 -0700 Subject: [PATCH 26/42] Bump rocm-docs-core from 0.30.1 to 0.30.2 in /docs/.sphinx (#171) Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.1 to 0.30.2. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.1...v0.30.2) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index ce13fde5..454c8157 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.30.1 +rocm-docs-core==0.30.2 # via -r requirements.in smmap==5.0.0 # via gitdb From 95af3c14c9950e76aafb852c0f41398eebf4abf9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Dec 2023 18:08:17 +0000 Subject: [PATCH 27/42] Bump rocm-docs-core from 0.30.2 to 0.30.3 in /docs/.sphinx Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.30.2 to 0.30.3. - [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases) - [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md) - [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.30.2...v0.30.3) --- updated-dependencies: - dependency-name: rocm-docs-core dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 454c8157..41817110 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -100,7 +100,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.30.2 +rocm-docs-core==0.30.3 # via -r requirements.in smmap==5.0.0 # via gitdb From e41eda6d29f7a6649b1be2a915df1bd41093edab Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Tue, 2 Jan 2024 13:28:56 -0700 Subject: [PATCH 28/42] Standardize documentation for ReadtheDocs (#176) --- .github/dependabot.yml | 2 +- .gitignore | 11 ----------- .readthedocs.yaml | 6 ++---- README.md | 21 ++++++++++++-------- docs/.gitignore | 12 +++++------- docs/.sphinx/requirements.in | 1 - docs/conf.py | 24 +++++++++++++++++++++-- docs/{.doxygen => doxygen}/Doxyfile | 7 ++++--- docs/license.rst | 4 ++++ docs/{.sphinx => sphinx}/_toc.yml.in | 3 +++ docs/sphinx/requirements.in | 1 + docs/{.sphinx => sphinx}/requirements.txt | 4 +--- 12 files changed, 56 insertions(+), 40 deletions(-) delete mode 100644 docs/.sphinx/requirements.in rename docs/{.doxygen => doxygen}/Doxyfile (99%) create mode 100644 docs/license.rst rename docs/{.sphinx => sphinx}/_toc.yml.in (84%) create mode 100644 docs/sphinx/requirements.in rename docs/{.sphinx => sphinx}/requirements.txt (98%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 95e8b2ba..0e0a252e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,7 +6,7 @@ version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values - directory: "/docs/.sphinx" # Location of package manifests + directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" diff --git a/.gitignore b/.gitignore index 674c60bc..9945a9dc 100644 --- a/.gitignore +++ b/.gitignore @@ -50,14 +50,3 @@ build* \#*\# *~ *.log - -# documentation artifacts -build/ -_build/ -_images/ -_static/ -_templates/ -_toc.yml -docBin/ -_doxygen/ -.venv diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e2bf130c..9e6678ab 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,11 +10,9 @@ formats: [htmlzip, pdf, epub] python: install: - - requirements: docs/.sphinx/requirements.txt + - requirements: docs/sphinx/requirements.txt build: - os: ubuntu-20.04 + os: ubuntu-22.04 tools: python: "3.8" - apt_packages: - - "doxygen" diff --git a/README.md b/README.md index f5e55943..5af7912d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Run the steps below to build documentation locally. ```shell cd docs -pip3 install -r .sphinx/requirements.txt +pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` @@ -98,21 +98,24 @@ After configuration, build with `cmake --build -- -j` ### Logger tests Tests API implementation of logger verbosity and functionality. -o /bin/logger_test + +* `/bin/logger_test` ## Running Contraction Tests ### Bilinear contraction tests Tests the API implementation of bilinear contraction algorithm with validation. -o /bin/bilinear_contraction_f32_test -o /bin/bilinear_contraction_f64_test + +* `/bin/bilinear_contraction_f32_test` +* `/bin/bilinear_contraction_f64_test` ### Scale contraction tests Tests the API implementation of scale contraction algorithm with validation. 
-o /bin/scale_contraction_f32_test -o /bin/scale_contraction_f64_test + +* `/bin/scale_contraction_f32_test` +* `/bin/scale_contraction_f64_test` ### Samples @@ -121,12 +124,14 @@ These are stand-alone use-cases of the hipTensor contraction operations. ## F32 Bilinear contraction Demonstrates the API implementation of bilinear contraction operation without validation. -o /bin/simple_contraction_bilinear_f32 + +* `/bin/simple_contraction_bilinear_f32` ## F32 Scale contraction Demonstrates the API implementation of scale contraction operation without validation. -o /bin/simple_contraction_scale_f32 + +* `/bin/simple_contraction_scale_f32` ### Build Samples as external client diff --git a/docs/.gitignore b/docs/.gitignore index a44ccbe0..594c0c8c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,7 +1,5 @@ -.doxygen/docBin -.sphinx/_toc.yml -_build -_doxygen -_images -_static -_templates \ No newline at end of file +doxygen/html +doxygen/xml +sphinx/_toc.yml +_build/ +_doxygen/ diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in deleted file mode 100644 index 313c5e94..00000000 --- a/docs/.sphinx/requirements.in +++ /dev/null @@ -1 +0,0 @@ -rocm-docs-core>=0.24.0 diff --git a/docs/conf.py b/docs/conf.py index 4f00fb9e..e7e64d90 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,11 +29,31 @@ # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re + from rocm_docs import ROCmDocs -docs_core = ROCmDocs("hipTensor Documentation") -docs_core.run_doxygen() +with open('../CMakeLists.txt', encoding='utf-8') as f: + match = re.search(r'.*\bset \( VERSION_STRING\s+\"?([0-9.]+)[^0-9.]+', f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] +left_nav_title = f"hipTensor {version_number} Documentation" + +# for PDF output on Read the Docs +project = "hipTensor Documentation" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number + +external_toc_path = "./sphinx/_toc.yml" + +docs_core = ROCmDocs(left_nav_title) +docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() +external_projects_current_project = "hiptensor" + for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) diff --git a/docs/.doxygen/Doxyfile b/docs/doxygen/Doxyfile similarity index 99% rename from docs/.doxygen/Doxyfile rename to docs/doxygen/Doxyfile index 136d3b8c..6f96968a 100644 --- a/docs/.doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = docBin +OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -786,7 +786,8 @@ WARN_AS_ERROR = YES INPUT = ../../library/include/hiptensor \ ../../library/include/hiptensor/internal \ - ../../library/src + ../../library/src \ + ../../README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -965,7 +966,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. 
-USE_MDFILE_AS_MAINPAGE = ../README.md +USE_MDFILE_AS_MAINPAGE = ../../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 00000000..141b5d3c --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,4 @@ +License +======= + +.. include:: ../LICENSE diff --git a/docs/.sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in similarity index 84% rename from docs/.sphinx/_toc.yml.in rename to docs/sphinx/_toc.yml.in index 37b5a62b..6da76c27 100644 --- a/docs/.sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -8,3 +8,6 @@ subtrees: - file: API_Reference_Guide - file: Programmers_Guide - file: Contributors_Guide + - caption: About + entries: + - file: license diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in new file mode 100644 index 00000000..b80af261 --- /dev/null +++ b/docs/sphinx/requirements.in @@ -0,0 +1 @@ +rocm-docs-core==0.30.3 diff --git a/docs/.sphinx/requirements.txt b/docs/sphinx/requirements.txt similarity index 98% rename from docs/.sphinx/requirements.txt rename to docs/sphinx/requirements.txt index 41817110..81f0b559 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -84,9 +84,7 @@ pygments==2.15.0 # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 - # via - # pygithub - # pyjwt + # via pygithub pynacl==1.5.0 # via pygithub pytz==2023.3.post1 From 749644231a57063188ddf90791668619c0170200 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Mon, 11 Dec 2023 17:49:52 -0500 Subject: [PATCH 29/42] Add API changes - Add API changes - Add test files - Add samples - Add cpu contraction for complex types - Add complex instances --- .../hiptensor/internal/hiptensor_utility.hpp | 15 + .../contraction_cpu_reference_impl.hpp | 169 +++-- .../contraction_cpu_reference_instances.cpp | 60 ++ .../src/contraction/contraction_pack_util.hpp | 101 +++ .../contraction_solution_instances.cpp | 53 ++ library/src/contraction/device/CMakeLists.txt | 2 + .../device_contraction_bilinear_complex.hpp | 595 +++++++++++++++ ...ffle_cf32_cf32_cf32_cf32_kknn_instance.cpp | 105 +++ .../device_contraction_scale_complex.hpp | 699 ++++++++++++++++++ ..._c_shuffle_cf32_cf32_cf32_kkn_instance.cpp | 105 +++ library/src/data_types.cpp | 16 +- library/src/hiptensor.cpp | 3 +- library/src/include/data_types.hpp | 1 + library/src/include/data_types_impl.hpp | 20 + samples/01_contraction/CMakeLists.txt | 2 + ...action_cf32_cf32_cf32_cf32_compute_f32.cpp | 57 ++ ...contraction_cf32_cf32_cf32_compute_f32.cpp | 57 ++ test/00_unit/yaml_test.cpp | 4 + test/01_contraction/contraction_test.cpp | 88 ++- test/device/common.hpp | 17 +- test/llvm/yaml_parser_config.cpp | 2 + 21 files changed, 2104 insertions(+), 67 deletions(-) create mode 100644 library/src/contraction/contraction_pack_util.hpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_complex.hpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_complex.hpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp create mode 100644 samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp diff 
--git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp index c386bbe0..746f1bbf 100644 --- a/library/include/hiptensor/internal/hiptensor_utility.hpp +++ b/library/include/hiptensor/internal/hiptensor_utility.hpp @@ -29,6 +29,7 @@ #include #include #include +#include #include "../hiptensor_types.hpp" #include "types_ext.hpp" @@ -61,6 +62,20 @@ } #endif +inline std::ostream& operator<<(std::ostream& os, const hipFloatComplex& fc) +{ + std::string separator = (hipCimagf(fc) >= 0) ? " + " : ""; + + return os << hipCrealf(fc) << separator << hipCimagf(fc) << "i"; +} + +inline std::ostream& operator<<(std::ostream& os, const hipDoubleComplex& dc) +{ + std::string separator = (hipCimag(dc) >= 0) ? " + " : ""; + + return os << hipCreal(dc) << separator << hipCimag(dc) << "i"; +} + template void hiptensorPrintArrayElements(std::ostream& stream, T* vec, size_t size) { diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 2e3d0cbe..25c317e3 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -156,62 +156,127 @@ namespace hiptensor indices.begin(), indices.end(), strides.begin(), std::size_t{0}); }; - auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - AccDataType accum = 0; + if constexpr((std::is_same_v && + std::is_same_v && + std::is_same_v) || + (std::is_same_v && + std::is_same_v && + std::is_same_v)) + { + auto f_ms_ns_complex = [&](auto m0, auto m1, auto n0, auto n1) { + HIP_vector_type accum{0}; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) + { + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + ADataType valA = ((ADataType*)arg.mA)[indexA]; + BDataType valB = ((BDataType*)arg.mB)[indexB]; + + // Mult / accum + if constexpr(std::is_same_v) + { + accum = hipCaddf(accum, hipCmulf(valA, valB)); + } + else if constexpr(std::is_same_v) + { + accum = hipCadd(accum, hipCmul(valA, valB)); + } + } + } + + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.scale_ * (EDataType)accum; + } + else // bilinear + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.alpha_ * (EDataType)accum + + arg.mOpCDE.beta_ * ((EDataType*)(arg.mD[0]))[indexD]; + } + }; + + make_ParallelTensorFunctor(f_ms_ns_complex, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } + else + { + auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { + AccDataType accum = 0; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) + { + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + AccDataType valA; + AccDataType valB; + + // Element-wise ops + arg.mOpA( + valA, + ck::type_convert(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, +
ck::type_convert(((BDataType*)arg.mB)[indexB])); + + // Mult / accum + accum += valA * valB; + } + } - auto K0 = arg.mA_ms_ks_lengths[2]; - auto K1 = arg.mA_ms_ks_lengths[3]; + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - for(size_t k0 = 0; k0 < K0; k0++) - { - for(size_t k1 = 0; k1 < K1; k1++) + if constexpr(std::is_same_v) { - auto indexA - = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); - auto indexB - = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - - AccDataType valA; - AccDataType valB; - - // Element-wise ops - arg.mOpA( - valA, - ck::type_convert(((ADataType*)arg.mA)[indexA])); - arg.mOpB( - valB, - ck::type_convert(((BDataType*)arg.mB)[indexB])); - - // Mult / accum - accum += valA * valB; + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } - } - - auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - - if constexpr(std::is_same_v) - { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], - ck::type_convert(accum)); - } - else // bilinear - { - // NumDTensor will be 1 due to SFINAE of this class - auto indexD - = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE(((EDataType*)arg.mE)[indexE], - ck::type_convert(accum), - ((EDataType*)(arg.mD[0]))[indexD]); - } - }; - - make_ParallelTensorFunctor(f_ms_ns, - arg.mE_ms_ns_lengths[0], - arg.mE_ms_ns_lengths[1], - arg.mE_ms_ns_lengths[2], - arg.mE_ms_ns_lengths[3])( - std::thread::hardware_concurrency()); + else // bilinear + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); + } + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } return 0; } diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 31fb0191..68b4ad1b 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -105,6 +105,21 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, float>()); + // Bilinear complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f64 registerSolutions( enumerateReferenceSolutions<2, @@ -134,6 +149,21 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, double>()); + // Bilinear complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); + // Scale f16 registerSolutions( enumerateReferenceSolutions<2, @@ -207,6 +237,21 @@ namespace hiptensor ck::tensor_operation::element_wise::Scale, float>()); + // Scale complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple<>, + hipFloatComplex, + 
ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f64 registerSolutions( enumerateReferenceSolutions<2, @@ -235,5 +280,20 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, double>()); + + // Scale complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp new file mode 100644 index 00000000..49741547 --- /dev/null +++ b/library/src/contraction/contraction_pack_util.hpp @@ -0,0 +1,101 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_CONTRACTION_PACK_UTIL_HPP +#define HIPTENSOR_CONTRACTION_PACK_UTIL_HPP + +#include "data_types.hpp" +#include "util.hpp" +#include + +namespace hiptensor +{ + /** + * \brief This function unpacks structured data (hipFloatComplex / hipDoubleComplex) + * into non-structured data (float / double). + */ + template + __global__ void unpack(const InputType* in, OutputType* out_real, OutputType *out_img, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + out_real[idx] = hipCrealf(in[idx]); + out_img[idx] = hipCimagf(in[idx]); + } + else if constexpr(std::is_same_v) + { + out_real[idx] = hipCreal(in[idx]); + out_img[idx] = hipCimag(in[idx]); + } + } + } + + /** + * \brief This function packs non-structured data (float / double) + * into structured data (hipFloatComplex / hipDoubleComplex). 
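+ *
+ * For example, in_real = {1.f, 2.f} and in_img = {3.f, 4.f} pack into
+ * out = {1+3i, 2+4i}; the unpack kernel above performs the inverse split.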
+ */ + template + __global__ void pack(const InputType* in_real, InputType* in_img, OutputType *out, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + out[idx] = make_hipFloatComplex((float)in_real[idx], (float)in_img[idx]); + } + else if constexpr(std::is_same_v) + { + out[idx] = make_hipDoubleComplex((double)in_real[idx], (double)in_img[idx]); + } + } + } + + struct DeviceDeleter + { + void operator()(void* ptr) + { + CHECK_HIP_ERROR(hipFree(ptr)); + } + }; + + template + auto allocDevice(int64_t numElements) + { + T* data; + CHECK_HIP_ERROR(hipMalloc(&data, numElements)); + return std::unique_ptr(data, DeviceDeleter()); + } + +} // namespace hiptensor + +#endif // HIPTENSOR_CONTRACTION_PACK_UTIL_HPP + diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index aec12e32..65ed8f34 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -101,6 +101,19 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, ck::bhalf_t>()); + // Bilinear complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>()); + // Bilinear f64 registerSolutions( enumerateContractionSolutions<2, @@ -127,6 +140,19 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, double>()); + // Bilinear complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>()); + // Scale bf16 registerSolutions( enumerateContractionSolutions<2, @@ -194,6 +220,20 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, ck::bhalf_t>()); + + // scale complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>()); + // Scale f64 registerSolutions( enumerateContractionSolutions<2, @@ -220,5 +260,18 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, double>()); + // scale complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>()); + } } // namespace hiptensor diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index eacac5b1..17bff3ca 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -45,6 +45,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -73,6 +74,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp new file mode 100644 index 00000000..2ffc9559 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -0,0 +1,595 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP +#define HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP + +#include "../contraction_pack_util.hpp" +#include "common.hpp" +#include + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + + using hiptensor::allocDevice; + using hiptensor::ceilDiv; + using hiptensor::DeviceDeleter; + using hiptensor::elementSpaceFromLengthsAndStrides; + + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + + // The following is a specialization class for bilinear contractions of complex types. + // For complex types, the contraction can be decomposed into 4 simple bilinear contractions of + // the complex element type. + // The class implements a CK interface to wrap the 4 individual contraction operations and argument + // handling internally. + // Note: We are assuming that the data comes in as an Array of Structures (AOS) format in complex pairs. + // The argument initialization portion decomposes this data into structure of arrays (SOA) where the + // real and complex elements can be operated on separately. + + // Tensor Contraction: + // input : A + // input : B + // input : D0, D1, ... + // output : E + // C = a_op(A) * b_op(B) + // E = cde_op(C, D0, D1, ...) + // Assume: + // A[M0, M1, M2, ..., K0, K1, K2, ...] + // B[N0, N1, N2, ..., K0, K1, K2, ...] + // D[M0, M1, M2, ..., N0, N1, N2, ...] + // E[M0, M1, M2, ..., N0, N1, N2, ...] + template + struct DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + HIP_vector_type, + HIP_vector_type, + AccDataType, + CShuffleDataType, + ck::Tuple>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Bilinear, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched> + + : public DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Bilinear, + ComputeDataType> + { + // Complex device Op + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + using CDEElementwiseOperation = Bilinear; + + // Complex types given through the interface + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + + // Internal functional types we will use to + // decompose the complex types and operate on. 
+ using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = DsDataType; + using DecompE = EDataType; + + // For complex types, we need to make sure that all of the types are the same + static_assert(std::is_same_v && std::is_same_v + && std::is_same_v + && std::is_same_v + && std::is_same_v, + "Complex operations must have the same data type"); + + static_assert(std::is_same_v || std::is_same_v, + "Complex operations only supported with single or double precision"); + + static constexpr index_t NumDTensor = 1; + + // The internal operation that we will decompose the complex operations with. + // For complex will be either float or double + using DecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + using DecompArg = typename DecompOp::Argument; + + Argument(Argument&& other) + : mArgs({std::move(other.mArgs[0]), + std::move(other.mArgs[1]), + std::move(other.mArgs[2]), + std::move(other.mArgs[3])}) + { + } + + Argument& operator=(Argument&& other) + { + if(this != &other) + { + mArgs[0] = std::move(other.mArgs[0]); + mArgs[1] = std::move(other.mArgs[1]); + mArgs[2] = std::move(other.mArgs[2]); + mArgs[3] = std::move(other.mArgs[3]); + } + return *this; + } + + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + // Take the incoming arguments, treat them as complex. 
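+                // Illustrative sketch of what hiptensor::unpack does below (assuming the
+                // HIP complex types are {x, y} pairs with x = real part and y = imaginary
+                // part, which holds for hipFloatComplex / hipDoubleComplex):
+                //
+                //   for each i in [0, elementCount):
+                //       out_r[i] = input[i].x;
+                //       out_i[i] = input[i].y;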
+
+                    // Allocate Real and Imaginary inputs
+                    auto elementsA
+                        = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides);
+                    auto elementsB
+                        = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides);
+                    auto elementsD = elementSpaceFromLengthsAndStrides(ds_ms_ns_lengths[0],
+                                                                       ds_ms_ns_strides[0]);
+                    auto elementsE
+                        = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides);
+
+                    mA_real.reset(nullptr);
+                    mA_imag.reset(nullptr);
+                    mB_real.reset(nullptr);
+                    mB_imag.reset(nullptr);
+                    mD_real.reset(nullptr);
+                    mD_imag.reset(nullptr);
+                    mE_real.reset(nullptr);
+                    mE_imag.reset(nullptr);
+
+                    auto blockDim = dim3(1024);
+
+                    auto decompGrid = [blockDim](auto&       out_r,
+                                                 auto&       out_i,
+                                                 auto const* input_grid,
+                                                 uint32_t    elementCount) {
+                        using DecompT = typename std::decay_t<decltype(out_r)>::element_type;
+                        static_assert(std::is_same_v<
+                                          DecompT,
+                                          typename std::decay_t<decltype(out_i)>::element_type>,
+                                      "r and i buffers must be same type");
+
+                        if(input_grid != nullptr)
+                        {
+                            out_r = std::move(allocDevice<DecompT>(elementCount));
+                            out_i = std::move(allocDevice<DecompT>(elementCount));
+
+                            auto gridDim = dim3(ceilDiv(elementCount, blockDim.x));
+                            hiptensor::unpack<<<gridDim, blockDim>>>(
+                                input_grid, out_r.get(), out_i.get(), elementCount);
+                        }
+                    };
+
+                    // Cast each input through its own complex alias (the types are
+                    // identical by the static_assert above, but the names should match).
+                    decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA);
+                    decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB);
+                    decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD);
+                    decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE);
+
+                    auto allocArgs = [a_ms_ks_lengths,
+                                      a_ms_ks_strides,
+                                      b_ns_ks_lengths,
+                                      b_ns_ks_strides,
+                                      ds_ms_ns_lengths,
+                                      ds_ms_ns_strides,
+                                      e_ms_ns_lengths,
+                                      e_ms_ns_strides,
+                                      a_element_op,
+                                      b_element_op](auto&       out_e,
+                                                    auto const& in_a,
+                                                    auto const& in_b,
+                                                    auto const& in_d,
+                                                    auto const& cde_element_op) {
+                        return std::make_unique<DecompArg>(
+                            in_a.get(),
+                            in_b.get(),
+                            std::array<const void*, NumDTensor>{in_d.get()},
+                            out_e.get(),
+                            a_ms_ks_lengths,
+                            a_ms_ks_strides,
+                            b_ns_ks_lengths,
+                            b_ns_ks_strides,
+                            ds_ms_ns_lengths,
+                            ds_ms_ns_strides,
+                            e_ms_ns_lengths,
+                            e_ms_ns_strides,
+                            a_element_op,
+                            b_element_op,
+                            cde_element_op);
+                    };
+
+                    // Steps 1 and 3 of the sketch above apply the caller's alpha / beta
+                    // against the real and imaginary D planes; steps 2 and 4 accumulate
+                    // into the partially-computed E planes, so their beta must be 1
+                    // (beta has already been applied once).
+                    mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op);
+                    mArgs[1] = allocArgs(
+                        mE_real,
+                        mA_imag,
+                        mB_imag,
+                        mE_real,
+                        CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f, 1.0f});
+                    mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op);
+                    mArgs[3] = allocArgs(
+                        mE_imag,
+                        mA_imag,
+                        mB_real,
+                        mE_imag,
+                        CDEElementwiseOperation{cde_element_op.alpha_, 1.0f});
+                }
+
+                void Print() const
+                {
+                    std::cout << "Args0:" << std::endl;
+                    mArgs[0]->Print();
+                    std::cout << "Args1:" << std::endl;
+                    mArgs[1]->Print();
+                    std::cout << "Args2:" << std::endl;
+                    mArgs[2]->Print();
+                    std::cout << "Args3:" << std::endl;
+                    mArgs[3]->Print();
+                }
+
+                // private:
+                // Each argument set for complex:
+                std::unique_ptr<DecompArg> mArgs[4];
+
+                template <typename DataT>
+                using DeviceArray = std::unique_ptr<DataT, DeviceDeleter>;
+
+                // Manage extra memory for AOS->SOA
+                DeviceArray<DecompA>  mA_real;
+                DeviceArray<DecompA>  mA_imag;
+                DeviceArray<DecompB>  mB_real;
+                DeviceArray<DecompB>  mB_imag;
+                DeviceArray<DecompDs> mD_real;
+                DeviceArray<DecompDs> mD_imag;
+                DeviceArray<DecompE>  mE_real;
+                DeviceArray<DecompE>  mE_imag;
+            };
+
+            // Invoker
+            struct Invoker : public BaseInvoker
+            {
+                using Argument = typename DeviceOp::Argument;
+
+                Invoker()
+                    : mInvoker(std::make_unique<typename DecompOp::Invoker>())
+                {
+                }
+
+                Invoker(Invoker&& other)
+                    : mInvoker(std::move(other.mInvoker))
+                {
+                }
+
+                Invoker& operator=(Invoker&& other)
+                {
+                    if(this != &other)
+                    {
+                        mInvoker = std::move(other.mInvoker);
+                    }
+                    return *this;
+                }
+
+                float Run(const Argument& arg,
+                          const
StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mInvoker->Run(arg.mArgs[0].get(), stream_config); + auto r1 = mInvoker->Run(arg.mArgs[1].get(), stream_config); + auto r2 = mInvoker->Run(arg.mArgs[2].get(), stream_config); + auto r3 = mInvoker->Run(arg.mArgs[3].get(), stream_config); + + // Reduce results? + return r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return DecompOp::IsSupportedArgument(*(arg.mArgs[0].get())) + && DecompOp::IsSupportedArgument(*(arg.mArgs[1].get())) + && DecompOp::IsSupportedArgument(*(arg.mArgs[2].get())) + && DecompOp::IsSupportedArgument(*(arg.mArgs[3].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. + this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[2].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mArgs[3].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + 
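+                // Illustrative output, using the first kknn instance defined later in
+                // this patch (2, 2, 2 dims; 256 block; 256x128x16 tile; AK1 = BK1 = 4;
+                // source vector dims 2, 2):
+                //   "DeviceContractionMultipleD_Xdl_CShuffle<2, 2, 2, 256, 256, 128, 16, 4, 4, 2, 2>"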
+ // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp new file mode 100644 index 00000000..fce71e8f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp @@ -0,0 +1,105 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! 
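+// (Editor's note, as we understand the trick: with this macro defined, CK relies on
+// the AMD buffer-load hardware returning zero for out-of-range accesses -- steering
+// out-of-bounds lanes to an invalid offset -- instead of explicit per-element
+// predication. Unverified against this CK version; treat as a sketch.)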
+// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + template + using S = ck::Sequence; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + + static constexpr auto GemmMNKPadding + = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance + = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 
32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float> + // clang-format on + >; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck + diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp 
b/library/src/contraction/device/device_contraction_scale_complex.hpp
new file mode 100644
index 00000000..96531ddd
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_complex.hpp
@@ -0,0 +1,699 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+#define HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+
+#include "../contraction_pack_util.hpp"
+#include "common.hpp"
+#include <hip/hip_complex.h>
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+
+            using hiptensor::allocDevice;
+            using hiptensor::ceilDiv;
+            using hiptensor::DeviceDeleter;
+            using hiptensor::elementSpaceFromLengthsAndStrides;
+
+            using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+            using Scale    = ck::tensor_operation::element_wise::Scale;
+
+            // The following is a specialization class for scale contractions of complex types.
+            // For complex types, the contraction is decomposed into one scale contraction
+            // and three bilinear contractions of the complex element type.
+            // The class implements a CK interface to wrap the four individual contraction
+            // operations and handle their arguments internally.
+            // Note: We are assuming that the data comes in as an Array of Structures (AOS)
+            // format in complex pairs. The argument initialization portion decomposes this
+            // data into a Structure of Arrays (SOA) where the real and imaginary parts can
+            // be operated on separately.
+
+            // Tensor Contraction:
+            //   input : A
+            //   input : B
+            //   output : E
+            // C = a_op(A) * b_op(B)
+            // E = cde_op(C)   (scale specialization: the Ds tuple is empty)
+            // Assume:
+            //   A[M0, M1, M2, ..., K0, K1, K2, ...]
+            //   B[N0, N1, N2, ..., K0, K1, K2, ...]
+            //   E[M0, M1, M2, ..., N0, N1, N2, ...]
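+            // Editor's sketch of the decomposition (assuming a real-valued scale factor
+            // alpha). Using (A_r + i*A_i)(B_r + i*B_i) = (A_r*B_r - A_i*B_i)
+            //                                            + i*(A_r*B_i + A_i*B_r),
+            // E = alpha*(A x B) is realized as one scale contraction plus three bilinear
+            // contractions that accumulate into the planar halves of E:
+            //
+            //   E_r  =  alpha*(A_r x B_r)            // mScaleArgs (scale op)
+            //   E_r += -alpha*(A_i x B_i)            // mBilinearArgs[0], beta = 1
+            //   E_i  =  alpha*(A_r x B_i) + 1*D_i    // mBilinearArgs[1], D_i assumed zero
+            //   E_i +=  alpha*(A_i x B_r)            // mBilinearArgs[2], beta = 1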
+ template + struct DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + HIP_vector_type, + HIP_vector_type, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Scale, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched> + + : public DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + Scale, + ComputeDataType> + { + // Complex device Op + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + // CDE Operations + using ScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = Bilinear; + + // Complex types given through the interface + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + + // Internal functional types we will use to + // decompose the complex types and operate on. + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = EDataType; + using DecompE = EDataType; + + // For complex types, we need to make sure that all of the types are the same + static_assert(std::is_same_v && std::is_same_v + && std::is_same_v + && std::is_same_v, + "Complex operations must have the same data type"); + + static_assert(std::is_same_v || std::is_same_v, + "Complex operations only supported with single or double precision"); + + static constexpr index_t NumDTensor = 0; + + // The internal operation that we will decompose the complex operations with. 
+ // For complex will be either float or double + using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + ScaleCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched>; + + // The internal operation that we will decompose the complex operations with. + // For complex will be either float or double + using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + BilinearCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + ComputeDataType, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + using ScaleDecompArgument = typename ScaleDecompOp::Argument; + using BilinearDecompArgument = typename BilinearDecompOp::Argument; + + Argument(Argument&& other) + : mScaleArgs({std::move(other.mScaleArgs)}) + , mBilinearArgs({std::move(other.mBilinearArgs[0]), + std::move(other.mBilinearArgs[1]), + std::move(other.mBilinearArgs[2])}) + { + } + + Argument& operator=(Argument&& other) + { + if(this != &other) + { + mScaleArgs = std::move(other.mScaleArgs); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + mBilinearArgs[2] = std::move(other.mBilinearArgs[2]); + } + return *this; + } + + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const 
std::vector<index_t>&        e_ms_ns_strides,
+                     AElementwiseOperation        a_element_op,
+                     BElementwiseOperation        b_element_op,
+                     ScaleCDEElementwiseOperation cde_element_op)
+            {
+                // Take the incoming arguments, treat them as complex.
+
+                // Allocate Real and Imaginary inputs
+                auto elementsA
+                    = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides);
+                auto elementsB
+                    = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides);
+                auto elementsE
+                    = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides);
+
+                mA_real.reset(nullptr);
+                mA_imag.reset(nullptr);
+                mB_real.reset(nullptr);
+                mB_imag.reset(nullptr);
+                mD_real.reset(nullptr);
+                mD_imag.reset(nullptr);
+                mE_real.reset(nullptr);
+                mE_imag.reset(nullptr);
+
+                auto blockDim = dim3(1024);
+
+                auto decompGrid = [blockDim](auto&       out_r,
+                                             auto&       out_i,
+                                             auto const* input_grid,
+                                             uint32_t    elementCount) {
+                    using DecompT = typename std::decay_t<decltype(out_r)>::element_type;
+                    static_assert(std::is_same_v<
+                                      DecompT,
+                                      typename std::decay_t<decltype(out_i)>::element_type>,
+                                  "r and i buffers must be same type");
+
+                    if(input_grid != nullptr)
+                    {
+                        out_r = std::move(allocDevice<DecompT>(elementCount));
+                        out_i = std::move(allocDevice<DecompT>(elementCount));
+
+                        auto gridDim = dim3(ceilDiv(elementCount, blockDim.x));
+                        hiptensor::unpack<<<gridDim, blockDim>>>(
+                            input_grid, out_r.get(), out_i.get(), elementCount);
+                    }
+                };
+
+                // Decompose the incoming data from AOS->SOA
+                decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA);
+                decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB);
+                decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE);
+
+                // Allocate extra space for the bilinear accumulation steps.
+                mD_real = std::move(allocDevice<DecompDs>(elementsE));
+                mD_imag = std::move(allocDevice<DecompDs>(elementsE));
+
+                auto allocScaleArgs = [a_ms_ks_lengths,
+                                       a_ms_ks_strides,
+                                       b_ns_ks_lengths,
+                                       b_ns_ks_strides,
+                                       ds_ms_ns_lengths,
+                                       ds_ms_ns_strides,
+                                       e_ms_ns_lengths,
+                                       e_ms_ns_strides,
+                                       a_element_op,
+                                       b_element_op](auto&       out_e,
+                                                     auto const& in_a,
+                                                     auto const& in_b,
+                                                     auto const& cde_element_op) {
+                    return std::make_unique<ScaleDecompArgument>(
+                        in_a.get(),
+                        in_b.get(),
+                        std::array<const void*, 0>{},
+                        out_e.get(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        ds_ms_ns_lengths,
+                        ds_ms_ns_strides,
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
+                };
+
+                auto allocBilinearArgs = [a_ms_ks_lengths,
+                                          a_ms_ks_strides,
+                                          b_ns_ks_lengths,
+                                          b_ns_ks_strides,
+                                          ds_ms_ns_lengths,
+                                          ds_ms_ns_strides,
+                                          e_ms_ns_lengths,
+                                          e_ms_ns_strides,
+                                          a_element_op,
+                                          b_element_op](auto&       out_e,
+                                                        auto const& in_a,
+                                                        auto const& in_b,
+                                                        auto const& in_d,
+                                                        auto const& cde_element_op) {
+                    return std::make_unique<BilinearDecompArgument>(
+                        in_a.get(),
+                        in_b.get(),
+                        std::array<const void*, 1>{in_d.get()},
+                        out_e.get(),
+                        a_ms_ks_lengths,
+                        a_ms_ks_strides,
+                        b_ns_ks_lengths,
+                        b_ns_ks_strides,
+                        std::array<std::vector<index_t>, 1>{e_ms_ns_lengths},
+                        std::array<std::vector<index_t>, 1>{e_ms_ns_strides},
+                        e_ms_ns_lengths,
+                        e_ms_ns_strides,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op);
+                };
+
+                // The four argument sets implement the decomposition sketched above.
+                // Note: mBilinearArgs[1] consumes mD_imag with beta = 1, which assumes
+                // the freshly allocated buffer reads as zero; mD_real is currently
+                // unused. If allocDevice does not zero-initialize, mD_imag must be
+                // cleared before Run().
+ mScaleArgs = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mBilinearArgs[0] = allocBilinearArgs( + mE_real, + mA_imag, + mB_imag, + mE_real, + BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); + mBilinearArgs[1] = allocBilinearArgs( + mE_imag, + mA_real, + mB_imag, + mD_imag, + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); + mBilinearArgs[2] = allocBilinearArgs( + mE_imag, + mA_imag, + mB_real, + mE_imag, + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); + } + + void Print() const + { + std::cout << "ScaleArgs:" << std::endl; + mScaleArgs->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + mBilinearArgs[1]->Print(); + std::cout << "BilinearArgs2:" << std::endl; + mBilinearArgs[2]->Print(); + } + + // private: + // Each argument set for complex: + std::unique_ptr mScaleArgs; + std::unique_ptr mBilinearArgs[3]; + + template + using DeviceArray = std::unique_ptr; + + // Manage extra memory for AOS->SOA + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mD_real; + DeviceArray mD_imag; + DeviceArray mE_real; + DeviceArray mE_imag; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = typename DeviceOp::Argument; + + Invoker() + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) + { + } + + Invoker(Invoker&& other) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , mBilinearInvoker(std::move(other.mBilinearInvoker)) + { + } + + Invoker& operator=(Invoker&& other) + { + if(this != &other) + { + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); + } + return *this; + } + + float Run(const Argument& arg, + const StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mScaleInvoker->Run(arg.mScaleArgs.get(), stream_config); + auto r1 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[2].get(), stream_config); + + // Reduce results? + return r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs.get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[2].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. 
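+                // All four decomposed argument sets share the single externally provided
+                // workspace, so the same pointer is forwarded to each of them (mirroring
+                // the bilinear complex specialization above).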
+ this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs.get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[2].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP + diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp new file mode 100644 index 00000000..dbec7ebe --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp @@ -0,0 +1,105 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + template + using S = ck::Sequence; + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Scale = ck::tensor_operation::element_wise::Scale; + + static constexpr auto GemmMNKPadding + = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] + // k/k/n are the fast changing dimension for A/B/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance + = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| 
_NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 32, 128, 
16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 > + // clang-format on + >; + + void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck + diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index 38e9f186..09df158f 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -79,6 +79,14 @@ namespace hiptensor { return sizeof(uint64_t); } + else if(id == HIP_C_32F) + { + return sizeof(hipFloatComplex); + } + else if(id == HIP_C_64F) + { + return sizeof(hipDoubleComplex); + } else if(id == NONE_TYPE) { return 0; @@ -102,11 +110,11 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_16F; } - else if(hipType == HIP_R_32F) + else if(hipType == HIP_R_32F || hipType == HIP_C_32F) { return HIPTENSOR_COMPUTE_32F; } - else if(hipType == HIP_R_64F) + else if(hipType == HIP_R_64F || hipType == HIP_C_64F) { return HIPTENSOR_COMPUTE_64F; } @@ -187,11 +195,11 @@ bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) { return (computeType == HIPTENSOR_COMPUTE_16F); } - else if(hipType == HIP_R_32F) + else if(hipType == HIP_R_32F || hipType == HIP_C_32F) { return (computeType == HIPTENSOR_COMPUTE_32F); } - else if(hipType == HIP_R_64F) + else if(hipType == HIP_R_64F || hipType == HIP_C_64F) { return (computeType == HIPTENSOR_COMPUTE_64F); } diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 51af1f48..8d185758 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -153,7 +153,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han if((lens == nullptr) || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) - && (dataType != HIP_R_64F)) + && (dataType != HIP_R_64F) && (dataType != HIP_C_32F) + && (dataType != HIP_C_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 19ccca6c..97402fa3 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -31,6 +31,7 @@ // Include order needs to be preserved #include #include +#include #include #include diff --git a/library/src/include/data_types_impl.hpp b/library/src/include/data_types_impl.hpp index 7df6d7d9..ef3e7c77 100644 --- a/library/src/include/data_types_impl.hpp +++ b/library/src/include/data_types_impl.hpp @@ -105,6 +105,18 @@ namespace hiptensor static constexpr auto value = HIP_R_64U; }; + template <> + struct HipDataType + { + static 
constexpr auto value = HIP_C_32F; + }; + + template <> + struct HipDataType + { + static constexpr auto value = HIP_C_64F; + }; + template <> struct HipDataType { @@ -162,6 +174,14 @@ namespace hiptensor { return static_cast(*(uint64_t*)value); } + else if constexpr(std::is_same_v && id == HIP_C_32F) + { + return static_cast(*(hipFloatComplex*)value); + } + else if constexpr(std::is_same_v && id == HIP_C_64F) + { + return static_cast(*(hipDoubleComplex*)value); + } else { #if !NDEBUG diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index 00393f1d..c51a2dbc 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -31,6 +31,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) @@ -38,6 +39,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_f32 simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp new file mode 100644 index 00000000..25392592 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeC = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp new file mode 100644 index 00000000..7fc5c3a3 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeD = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); +} diff --git a/test/00_unit/yaml_test.cpp b/test/00_unit/yaml_test.cpp index 2efc6b6e..372fbbdd 100644 --- a/test/00_unit/yaml_test.cpp +++ b/test/00_unit/yaml_test.cpp @@ -79,9 +79,13 @@ int main(int argc, char* argv[]) yee.mDataTypes = { // clang-format off {HIP_R_32F, HIP_R_32F, hiptensor::NONE_TYPE, HIP_R_32F, HIP_R_32F}, // scale F32 + {HIP_C_32F, HIP_C_32F, hiptensor::NONE_TYPE, HIP_C_32F, HIP_C_32F}, // scale F32 Complex {HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F}, // bilinear F32 + {HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F}, // bilinear F32 Complex {HIP_R_64F, HIP_R_64F, hiptensor::NONE_TYPE, HIP_R_64F, HIP_R_64F}, // scale F64 + {HIP_C_64F, HIP_C_64F, hiptensor::NONE_TYPE, HIP_C_64F, HIP_C_64F}, // scale F64 Complex {HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F}, // bilinear F64 + {HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F}, // bilinear F64 Complex // clang-format on }; yee.mAlgorithms diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 76cc3033..2059fd73 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -57,8 +57,8 @@ namespace hiptensor bool ContractionTest::checkDevice(hipDataType datatype) const { return (isF32Supported() - && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) - || (isF64Supported() && datatype == HIP_R_64F); + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF || datatype == HIP_C_32F)) + || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F)); } bool ContractionTest::checkSizes() const @@ -117,14 +117,18 @@ namespace hiptensor auto DDataType = testType[3]; EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) - || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F) + || (ADataType == HIP_C_32F) || (ADataType == HIP_C_64F)); EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) - || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F) + || (BDataType == HIP_C_32F) || (BDataType == HIP_C_64F)); EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + || (CDataType == HIP_C_32F) || (CDataType == HIP_C_64F) || (CDataType == NONE_TYPE)); EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) - || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F) + || (DDataType == HIP_C_32F) || (DDataType == HIP_C_64F)); EXPECT_TRUE( (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) || 
        EXPECT_TRUE((computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF)
                    || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F));
@@ -290,6 +294,32 @@
                                elementsCD,
                                std::numeric_limits<double>::signaling_NaN());
        }
+        else if(ADataType == HIP_C_32F && BDataType == HIP_C_32F && DDataType == HIP_C_32F)
+        {
+            // Initialize matrix data on device
+            fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceA().get(), elementsA);
+            fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceB().get(), elementsB);
+            if(CDataType == HIP_C_32F)
+            {
+                fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceC().get(), elementsCD);
+            }
+            fillValLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceD().get(),
+                                                 elementsCD,
+                                                 std::numeric_limits<hipFloatComplex>::signaling_NaN());
+        }
+        else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F)
+        {
+            // Initialize matrix data on device
+            fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceA().get(), elementsA);
+            fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceB().get(), elementsB);
+            if(CDataType == HIP_C_64F)
+            {
+                fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceC().get(), elementsCD);
+            }
+            fillValLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceD().get(),
+                                                  elementsCD,
+                                                  std::numeric_limits<hipDoubleComplex>::signaling_NaN());
+        }

        resource->copyDeviceToHostAll(elementBytes);
@@ -446,7 +476,7 @@
                hiptensorPrintArrayElements(stream, (float*)D.get(), elementsCD);
                stream << std::endl;
            }
-            else
+            else if(DDataType == HIP_R_64F)
            {
                stream << "Tensor A elements:\n";
                hiptensorPrintArrayElements(
                    stream, (double*)resource->hostA().get(), elementsA);
                stream << std::endl;
@@ -467,6 +497,48 @@
                hiptensorPrintArrayElements(stream, (double*)D.get(), elementsCD);
                stream << std::endl;
            }
+            else if(DDataType == HIP_C_32F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipFloatComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipFloatComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipFloatComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements(stream, (hipFloatComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_C_64F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipDoubleComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipDoubleComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements(
+                    stream, (hipDoubleComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements(stream, (hipDoubleComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
        }
    }
}
@@ -566,12 +638,12 @@
                                                             (hip_bfloat16*)reference.get(),
                                                             elementsCD);
        }
-        else if(DDataType == HIP_R_32F)
+        else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F)
        {
            std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<float>(
                (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD);
        }
-        else if(DDataType == HIP_R_64F)
+        else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F)
        {
            std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<double>(
                (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD);
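// The validation branches above reuse the real-valued comparison kernels for
// complex results. That works because hipFloatComplex is layout-compatible with
// two consecutive floats, so a complex buffer can be scanned as a flat real
// array. The same idea, host-side (names illustrative, not the library's API):
//
//     static_assert(sizeof(hipFloatComplex) == 2 * sizeof(float));
//     const float* flat = reinterpret_cast<const float*>(complexBuf); // re, im, re, im, ...
//
diff --git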
a/test/device/common.hpp b/test/device/common.hpp index 172e6953..392c74c9 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,8 +72,21 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; - data[index] = static_cast(value); + if constexpr(std::is_same_v) + { + auto value = (float(index / float(RAND_MAX) - 0.5) * 100) / elementSize; + data[index] = make_hipFloatComplex(value, value); + } + else if constexpr(std::is_same_v) + { + auto value = (double(index / double(RAND_MAX) - 0.5) * 100) / elementSize; + data[index] = make_hipDoubleComplex(value, value); + } + else + { + auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + data[index] = static_cast(value); + } } } diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index cd3eb46f..5c674045 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -113,6 +113,8 @@ namespace llvm io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); + io.enumCase(value, "HIP_C_32F", HIP_C_32F); + io.enumCase(value, "HIP_C_64F", HIP_C_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE); } }; From be8c0b5566ada525a694dc44afafccff2dccb643 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Tue, 12 Dec 2023 12:11:20 -0500 Subject: [PATCH 30/42] Add compute types - Add compute types - Add instance factory for scale --- .../contraction_solution_instances.cpp | 12 ++-- ..._c_shuffle_cf32_cf32_cf32_kkn_instance.cpp | 57 ++++++++++++++++++- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 65ed8f34..51e31635 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -112,7 +112,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); // Bilinear f64 registerSolutions( @@ -151,7 +152,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale bf16 registerSolutions( @@ -232,7 +234,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f64 registerSolutions( @@ -271,7 +274,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp index dbec7ebe..e6d5d15d 100644 --- 
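// The fillKernel specialization in the hunk above generates deterministic
// pseudo-data and, for complex types, mirrors it into both components. A
// host-side sketch of the complex branch (fillKernel itself runs on device):
//
//     for(uint32_t i = 0; i < elementSize; ++i)
//     {
//         float v = (i / float(RAND_MAX) - 0.5f) * 100.0f / elementSize;
//         data[i] = make_hipFloatComplex(v, v); // real == imag by construction
//     }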
a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp @@ -57,7 +57,7 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] // k/k/n are the fast changing dimension for A/B/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance = std::tuple< // clang-format off //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| @@ -80,7 +80,7 @@ namespace ck // clang-format on >; - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( std::vector + struct DeviceOperationInstanceFactory, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> + { + using DeviceOp = DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ComputeDataType>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) + { + if constexpr(is_same_v) + { + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + op_ptrs); + } + } + } + } + }; } // namespace instance } // namespace device } // namespace tensor_operation From f6e39c14fda8d2e2eafd4f38d1f29509e5474746 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Wed, 13 Dec 2023 10:01:44 -0500 Subject: [PATCH 31/42] Modify instance factory - Modify complex scale impl - Modify pack func --- .../src/contraction/contraction_pack_util.hpp | 2 +- .../contraction_solution_instances.cpp | 4 + library/src/contraction/device/CMakeLists.txt | 18 +- .../device_contraction_bilinear_complex.hpp | 42 ++- ...ffle_cf32_cf32_cf32_cf32_kknn_instance.cpp | 51 ++-- ...ffle_cf32_cf32_cf32_cf32_knnn_instance.cpp | 89 ++++++ ...ffle_cf32_cf32_cf32_cf32_mknn_instance.cpp | 89 ++++++ ...ffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp | 89 ++++++ ...ffle_cf64_cf64_cf64_cf64_kknn_instance.cpp | 90 ++++++ ...ffle_cf64_cf64_cf64_cf64_knnn_instance.cpp | 89 ++++++ ...ffle_cf64_cf64_cf64_cf64_mknn_instance.cpp | 89 ++++++ ...ffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp | 89 ++++++ .../device_contraction_scale_complex.hpp | 89 +++--- ..._c_shuffle_cf32_cf32_cf32_kkn_instance.cpp | 129 +++------ ..._c_shuffle_cf32_cf32_cf32_knn_instance.cpp | 89 ++++++ ..._c_shuffle_cf32_cf32_cf32_mkn_instance.cpp | 89 ++++++ ..._c_shuffle_cf32_cf32_cf32_mnn_instance.cpp | 89 ++++++ ..._c_shuffle_cf64_cf64_cf64_kkn_instance.cpp | 91 +++++++ ..._c_shuffle_cf64_cf64_cf64_knn_instance.cpp | 89 ++++++ 
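// The DeviceOperationInstanceFactory specialization in the hunk above follows
// composable_kernel's usual shape: specialize the factory on a full
// DeviceContractionMultipleD signature, then populate a vector of type-erased
// operations. A simplified sketch of the dispatch (names abbreviated; the real
// template argument lists are longer than shown):
//
//     static auto GetInstances()
//     {
//         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
//         if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2
//                      && is_same_v<ComputeDataType, float>)
//         {
//             add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance(op_ptrs);
//         }
//         return op_ptrs; // GetInstances must hand the collected instances back
//     }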
..._c_shuffle_cf64_cf64_cf64_mkn_instance.cpp | 89 ++++++ ..._c_shuffle_cf64_cf64_cf64_mnn_instance.cpp | 88 ++++++ ...tensor_contraction_bilinear_instances.hpp} | 169 ++++++------ ...hiptensor_contraction_scale_instances.hpp} | 257 ++++++++++-------- 23 files changed, 1626 insertions(+), 383 deletions(-) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp rename library/src/contraction/device/{hiptensor_contraction_bilinear.hpp => hiptensor_contraction_bilinear_instances.hpp} (75%) rename library/src/contraction/device/{hiptensor_contraction_scale.hpp => hiptensor_contraction_scale_instances.hpp} (63%) diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index 49741547..f242f3ea 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -91,7 +91,7 @@ namespace hiptensor auto allocDevice(int64_t numElements) { T* data; - CHECK_HIP_ERROR(hipMalloc(&data, numElements)); + CHECK_HIP_ERROR(hipMalloc(&data, numElements * sizeof(T))); return std::unique_ptr(data, DeviceDeleter()); } diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 51e31635..2cec41bc 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -27,6 +27,10 @@ #include "contraction_solution_instances.hpp" #include "contraction_solution.hpp" +// Ensure access to +#include "device/hiptensor_contraction_bilinear_instances.hpp" +#include "device/hiptensor_contraction_scale_instances.hpp" + namespace hiptensor { 
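// The contraction_pack_util.hpp hunk above fixes an undersized allocation:
// hipMalloc takes a byte count, so passing `numElements` allocated bytes, not
// elements, and under-allocated by a factor of sizeof(T). The corrected helper,
// restated as a sketch:
//
//     template <typename T>
//     auto allocDevice(int64_t numElements)
//     {
//         T* data;
//         CHECK_HIP_ERROR(hipMalloc(&data, numElements * sizeof(T))); // bytes, not elements
//         return std::unique_ptr<T, DeviceDeleter>(data, DeviceDeleter());
//     }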
ContractionSolutionInstances::ContractionSolutionInstances() diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index 17bff3ca..3ac03149 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -29,6 +29,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -45,7 +53,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -58,6 +65,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -74,7 +89,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 2ffc9559..d57c4fdf 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -294,7 +294,7 @@ namespace ck = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); auto elementsD = elementSpaceFromLengthsAndStrides(ds_ms_ns_lengths[0], ds_ms_ns_strides[0]); - auto elementsE + elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); mA_real.reset(nullptr); @@ -305,7 +305,10 @@ namespace ck mD_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); + mE_real_buf.reset(nullptr); + mE_imag_buf.reset(nullptr); + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -330,9 +333,13 @@ namespace ck }; decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); - decompGrid(mB_real, mB_imag, (const ComplexA*)p_b_grid, elementsB); - decompGrid(mD_real, mD_imag, (const ComplexA*)p_ds_grid[0], elementsD); - decompGrid(mE_real, mE_imag, (const ComplexA*)p_e_grid, elementsE); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); + + // Allocate extra space for intermediate results. 
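// Why two scratch buffers are allocated just below: with A = Ar + i*Ai,
// B = Br + i*Bi, D = Dr + i*Di and real-valued alpha/beta, the complex bilinear
// contraction E = alpha*(A x B) + beta*D splits into four real contractions,
// which is exactly what mArgs[0..3] below implement:
//
//     mE_real_buf = alpha*(Ar x Br) + beta*Dr              // mArgs[0]
//     Er          = -alpha*(Ai x Bi) + 1.0*mE_real_buf     // mArgs[1]
//     mE_imag_buf = alpha*(Ar x Bi) + beta*Di              // mArgs[2]
//     Ei          = alpha*(Ai x Br) + 1.0*mE_imag_buf      // mArgs[3]
//
// following (Ar + i*Ai)(Br + i*Bi) = (Ar*Br - Ai*Bi) + i*(Ar*Bi + Ai*Br).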
+                mE_real_buf = std::move(allocDevice(elementsE));
+                mE_imag_buf = std::move(allocDevice(elementsE));

                auto allocArgs = [a_ms_ks_lengths,
                                  a_ms_ks_strides,
@@ -366,15 +373,16 @@
                                    cde_element_op);
                };

-                mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op);
+                mArgs[0] = allocArgs(mE_real_buf, mA_real, mB_real, mD_real, cde_element_op);
                mArgs[1] = allocArgs(mE_real,
                                     mA_imag,
                                     mB_imag,
-                                     mE_real,
+                                     mE_real_buf,
                                     CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f,
-                                                             cde_element_op.beta_});
-                mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op);
-                mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, cde_element_op);
+                                                             1.0f});
+                mArgs[2] = allocArgs(mE_imag_buf, mA_real, mB_imag, mD_imag, cde_element_op);
+                mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag_buf,
+                                     CDEElementwiseOperation{cde_element_op.alpha_, 1.0f});
            }

            void Print() const
@@ -405,6 +413,11 @@
                DeviceArray mD_imag;
                DeviceArray mE_real;
                DeviceArray mE_imag;
+                DeviceArray mE_real_buf;
+                DeviceArray mE_imag_buf;
+
+                void*   mE_grid;
+                index_t elementsE;
            };

            // Invoker
@@ -439,8 +452,15 @@
                auto r2 = mInvoker->Run(arg.mArgs[2].get(), stream_config);
                auto r3 = mInvoker->Run(arg.mArgs[3].get(), stream_config);

-                // Reduce results?
-                return r3;
+                if(arg.mE_grid != nullptr)
+                {
+                    auto blockDim = dim3(1024);
+                    auto gridDim  = dim3(ceilDiv(arg.elementsE, blockDim.x));
+                    hiptensor::pack<<<gridDim, blockDim, 0, stream_config.stream_id_>>>(
+                        arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE);
+                }
+
+                return r0 + r1 + r2 + r3;
            }

            // polymorphic
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp
index fce71e8f..03514f47 100644
--- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp
@@ -33,6 +33,12 @@
 #include "common.hpp"
 #include "device_contraction_bilinear_complex.hpp"

+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
 namespace ck
 {
     namespace tensor_operation
@@ -45,39 +51,19 @@
            using CF32       = hipFloatComplex;
            using CF32_Tuple = ck::Tuple<CF32>;

-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
            // k/k/n/n are the fast changing dimension for A/B/D/E
            using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance
-                = std::tuple<
-        // clang-format off
-        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer|
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , 
CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32, CF32, float , float , CF32_Tuple, CF32 , PassThrough, PassThrough , Bilinear , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, float> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( @@ -91,7 +77,7 @@ namespace ck PassThrough, PassThrough, Bilinear, - float>>>& instances) + F32>>>& instances) { add_device_operation_instances( instances, @@ -102,4 +88,3 @@ namespace ck } // namespace device } // namespace tensor_operation } // namespace ck - diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp new file mode 100644 index 00000000..bb1ccde5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp new file mode 100644 index 00000000..2d47acc0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp new file mode 100644 index 00000000..4c881c0a --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp new file mode 100644 index 00000000..ed2ba843 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
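// Naming key for this instance family (cf32 files above, cf64 files below):
// the letter group before "_instance" names the fastest-varying mode of each
// tensor, in A/B/D/E order for bilinear and A/B/E order for scale. Examples:
//
//     ..._cf32_cf32_cf32_cf32_kknn_instance -> device_contraction_kk_instance
//         (A k-fastest, B k-fastest, D/E n-fastest)
//     ..._cf32_cf32_cf32_cf32_mnnn_instance -> device_contraction_mn_instance
//         (A m-fastest, B n-fastest, D/E n-fastest)
//
// Sharing one kk/kn/mk/mn alias per layout keeps the block/tile tuning tables
// common across element types, which is why the per-file std::tuple lists
// could be deleted in this patch.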
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp new file mode 100644 index 00000000..03dd9293 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp new file mode 100644 index 00000000..c44a5daf --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp new file mode 100644 index 00000000..d045a404 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 96531ddd..94f3b7f1 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -301,10 +301,10 @@ namespace ck using BilinearDecompArgument = typename BilinearDecompOp::Argument; Argument(Argument&& other) - : mScaleArgs({std::move(other.mScaleArgs)}) + : mScaleArgs({std::move(other.mScaleArgs[0]), + std::move(other.mScaleArgs[1])}) , mBilinearArgs({std::move(other.mBilinearArgs[0]), - std::move(other.mBilinearArgs[1]), - std::move(other.mBilinearArgs[2])}) + std::move(other.mBilinearArgs[1])}) { } @@ -312,10 +312,10 @@ namespace ck { if(this != &other) { - mScaleArgs = std::move(other.mScaleArgs); + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); - mBilinearArgs[2] = std::move(other.mBilinearArgs[2]); } return *this; } @@ -350,11 +350,12 @@ namespace ck mA_imag.reset(nullptr); mB_real.reset(nullptr); mB_imag.reset(nullptr); - mD_real.reset(nullptr); - mD_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); + mE_real_buf.reset(nullptr); 
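// The scale-complex path rebuilt below mirrors the bilinear one, but with no
// D tensor: E = alpha*(A x B). With real alpha it decomposes into two scale
// contractions into scratch buffers plus two bilinear accumulations, matching
// the mScaleArgs[0..1] / mBilinearArgs[0..1] sets constructed further down:
//
//     mE_real_buf = alpha*(Ar x Br)                        // mScaleArgs[0]
//     mE_imag_buf = alpha*(Ar x Bi)                        // mScaleArgs[1]
//     Er          = -alpha*(Ai x Bi) + 1.0*mE_real_buf     // mBilinearArgs[0]
//     Ei          =  alpha*(Ai x Br) + 1.0*mE_imag_buf     // mBilinearArgs[1]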
+ mE_imag_buf.reset(nullptr); + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -380,12 +381,12 @@ namespace ck // Decompose the incoming data from AOS->SOA decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); - decompGrid(mB_real, mB_imag, (const ComplexA*)p_b_grid, elementsB); - decompGrid(mE_real, mE_imag, (const ComplexA*)p_e_grid, elementsE); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - // Allocate extra space bilinear op. - mD_real = std::move(allocDevice(elementsE)); - mD_imag = std::move(allocDevice(elementsE)); + // Allocate extra space for intermediate results to bilinear op. + mE_real_buf = std::move(allocDevice(elementsE)); + mE_imag_buf = std::move(allocDevice(elementsE)); auto allocScaleArgs = [a_ms_ks_lengths, a_ms_ks_strides, @@ -450,44 +451,38 @@ namespace ck cde_element_op); }; - // Not sure about these... - mScaleArgs = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[0] = allocScaleArgs(mE_real_buf, mA_real, mB_real, cde_element_op); + mScaleArgs[1] = allocScaleArgs(mE_imag_buf, mA_real, mB_imag, cde_element_op); mBilinearArgs[0] = allocBilinearArgs( mE_real, mA_imag, mB_imag, - mE_real, + mE_real_buf, BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_real, - mB_imag, - mD_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); - mBilinearArgs[2] = allocBilinearArgs( mE_imag, mA_imag, mB_real, - mE_imag, + mE_imag_buf, BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); } void Print() const { - std::cout << "ScaleArgs:" << std::endl; - mScaleArgs->Print(); + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); std::cout << "BilinearArgs0:" << std::endl; mBilinearArgs[0]->Print(); std::cout << "BilinearArgs1:" << std::endl; mBilinearArgs[1]->Print(); - std::cout << "BilinearArgs2:" << std::endl; - mBilinearArgs[2]->Print(); } // private: // Each argument set for complex: - std::unique_ptr mScaleArgs; - std::unique_ptr mBilinearArgs[3]; + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; template using DeviceArray = std::unique_ptr; @@ -497,10 +492,13 @@ namespace ck DeviceArray mA_imag; DeviceArray mB_real; DeviceArray mB_imag; - DeviceArray mD_real; - DeviceArray mD_imag; DeviceArray mE_real; DeviceArray mE_imag; + DeviceArray mE_real_buf; + DeviceArray mE_imag_buf; + + void* mE_grid; + index_t elementsE; }; // Invoker @@ -533,13 +531,20 @@ namespace ck float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - auto r0 = mScaleInvoker->Run(arg.mScaleArgs.get(), stream_config); - auto r1 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); - auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); - auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[2].get(), stream_config); + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); - // Reduce results? 
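// After the four sub-contractions, the real and imaginary planes live in
// separate (SOA) buffers, so Run() repacks them into the caller's interleaved
// (AOS) E tensor and reports the summed kernel times. What hiptensor::pack is
// assumed to do, as a sketch:
//
//     __global__ void pack(const float* re, const float* im, hipFloatComplex* out, int n)
//     {
//         int i = blockIdx.x * blockDim.x + threadIdx.x;
//         if(i < n)
//             out[i] = make_hipFloatComplex(re[i], im[i]);
//     }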
- return r3; + if(arg.mE_grid != nullptr) + { + auto blockDim = dim3(1024); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + hiptensor::pack<<>>( + arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); + } + + return r0 + r1 + r2 + r3; } // polymorphic @@ -555,10 +560,10 @@ namespace ck static bool IsSupportedArgument(const Argument& arg) { - return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs.get())) + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) - && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())) - && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[2].get())); + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); } // polymorphic @@ -576,13 +581,13 @@ namespace ck // Call the base, then fwd to each arg. this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); auto* arg = dynamic_cast(p_arg); - this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs.get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( arg->mBilinearArgs[0].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( arg->mBilinearArgs[1].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer( - arg->mBilinearArgs[2].get(), p_workspace, s); } static auto MakeArgument( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp index e6d5d15d..3352556d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp @@ -33,6 +33,12 @@ #include "common.hpp" #include "device_contraction_scale_complex.hpp" +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + namespace ck { namespace tensor_operation @@ -46,109 +52,38 @@ namespace ck using CF32 = hipFloatComplex; using Empty_Tuple = ck::Tuple<>; - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| 
MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, 
CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 >, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, CF32 , CF32 , F32 , F32 , Empty_Tuple, CF32 , PassThrough , PassThrough , Scale , GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4, F32 > - // clang-format on - >; - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( - std::vector>>& instances) - { + = device_contraction_kk_instance; + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + std::vector>>& instances) + { add_device_operation_instances( instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance{}); } - // Contraction + Scale - template - struct DeviceOperationInstanceFactory, - HIP_vector_type, - ck::Tuple<>, - HIP_vector_type, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - ComputeDataType>> - { - using DeviceOp = DeviceContractionMultipleD, - HIP_vector_type, - ck::Tuple<>, - HIP_vector_type, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - ComputeDataType>; - - static auto GetInstances() - { - std::vector> op_ptrs; - - if constexpr(is_same_v && is_same_v && - is_same_v) - { - if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) - { - if constexpr(is_same_v) - { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( - op_ptrs); - } - } - } - } - }; } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp new file mode 100644 index 00000000..cfd6c7f4 --- /dev/null +++ 
b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp new file mode 100644 index 00000000..eacc1148 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp @@ -0,0 +1,89 @@ 
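These near-identical new instance files differ only by their layout suffix: the letters name which index is contiguous ("fast changing") in each operand, matching the comment each file carries (e.g. "k/n/n/n are the fast changing dimension for A/B/D/E" for knn). In stride terms the suffix selects where the unit stride sits; a sketch under one common convention, with hypothetical extents (the real values arrive through the hipTensor tensor descriptors):

    #include <cstdint>
    #include <vector>

    int main()
    {
        // Extents for A[m0, m1, k0, k1]
        const std::int64_t M0 = 4, M1 = 8, K0 = 16, K1 = 32;

        // "k"-fastest A (the kkn/knn instances): k1 carries the unit stride.
        std::vector<std::int64_t> aStridesKFast = {M1 * K0 * K1, K0 * K1, K1, 1};

        // "m"-fastest A (the mkn/mnn instances): m1 carries the unit stride instead.
        std::vector<std::int64_t> aStridesMFast = {M1, 1, K1 * M0 * M1, M0 * M1};

        (void)aStridesKFast;
        (void)aStridesMFast;
        return 0;
    }

Dispatching to the instance whose suffix matches the unit-stride pattern of the caller's descriptors is what keeps the XDL kernels on coalesced loads and stores.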
+/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp new file mode 100644 index 00000000..b5e79372 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp new file mode 100644 index 00000000..c0934498 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp new file mode 100644 index 00000000..8514cb70 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp new file mode 100644 index 00000000..09d589d6 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp new file mode 100644 index 00000000..6b90050b --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp @@ -0,0 +1,88 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance{}); + } + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/hiptensor_contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp similarity index 75% rename from library/src/contraction/device/hiptensor_contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp index e8f73b58..eac0f117 100644 --- a/library/src/contraction/device/hiptensor_contraction_bilinear.hpp +++ b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp @@ -37,120 +37,126 @@ namespace ck { namespace instance { - - // float + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + void - 
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( std::vector>>& - instances); + Bilinear, + F32>>>& instances); // double void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( std::vector>>& - instances); + Bilinear, + F64>>>& instances); // Contraction + Bilinear template + typename DsDataType, + typename EDataType, + typename ComputeDataT> struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>> + ck::tensor_operation::element_wise::Bilinear, + ComputeDataT>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>; + ck::tensor_operation::element_wise::Bilinear, + ComputeDataT>; static auto GetInstances() { std::vector> op_ptrs; if constexpr(is_same_v && is_same_v - && is_same_v && is_same_v) + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( op_ptrs); - 
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( op_ptrs); } } if constexpr(is_same_v && is_same_v - && is_same_v + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( op_ptrs); } } diff --git a/library/src/contraction/device/hiptensor_contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp similarity index 63% rename from library/src/contraction/device/hiptensor_contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale_instances.hpp index 916f79de..fff9dca6 100644 --- a/library/src/contraction/device/hiptensor_contraction_scale.hpp +++ b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp @@ -37,136 +37,159 @@ namespace ck { namespace instance { - - // float - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances); - - // double - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances); - + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + using F64 = double; + using CF64 = hipDoubleComplex; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + std::vector>>& 
instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + std::vector>>& instances); + // Contraction + Scale template - struct HipTensorDeviceOperationInstanceFactory< + typename EDataType, + typename ComputeDataType> + struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>> + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>; + ck::tensor_operation::element_wise::Scale, + ComputeDataType>; static auto GetInstances() { @@ -177,13 +200,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( op_ptrs); } } @@ -193,13 +216,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( op_ptrs); } } From 075cdfa071ea33406ebc307cae85e3b05228599a Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Fri, 15 Dec 2023 02:11:27 -0500 Subject: [PATCH 32/42] Enable complex f32 - Remove intermediate buffers --- .../device_contraction_bilinear_complex.hpp | 16 ++++------------ .../device_contraction_scale_complex.hpp | 18 +++++------------- .../configs/bilinear_test_params.yaml | 16 +++++++++------- .../configs/scale_test_params.yaml | 2 ++ 4 files changed, 20 insertions(+), 32 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp 
b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index d57c4fdf..2df240c4 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -305,8 +305,6 @@ namespace ck mD_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_real_buf.reset(nullptr); - mE_imag_buf.reset(nullptr); mE_grid = p_e_grid; auto blockDim = dim3(1024); @@ -337,10 +335,6 @@ namespace ck decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - // Allocate extra space for intermediate results. - mE_real_buf = std::move(allocDevice(elementsE)); - mE_imag_buf = std::move(allocDevice(elementsE)); - auto allocArgs = [a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, @@ -373,15 +367,15 @@ namespace ck cde_element_op); }; - mArgs[0] = allocArgs(mE_real_buf, mA_real, mB_real, mD_real, cde_element_op); + mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op); mArgs[1] = allocArgs(mE_real, mA_imag, mB_imag, - mE_real_buf, + mE_real, CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f, 1.0f}); - mArgs[2] = allocArgs(mE_imag_buf, mA_real, mB_imag, mD_imag, cde_element_op); - mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag_buf, + mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op); + mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f}); } @@ -413,8 +407,6 @@ namespace ck DeviceArray mD_imag; DeviceArray mE_real; DeviceArray mE_imag; - DeviceArray mE_real_buf; - DeviceArray mE_imag_buf; void* mE_grid; index_t elementsE; diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 94f3b7f1..f3a7fd2e 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -343,7 +343,7 @@ namespace ck = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides); auto elementsB = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); - auto elementsE + elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); mA_real.reset(nullptr); @@ -352,8 +352,6 @@ namespace ck mB_imag.reset(nullptr); mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_real_buf.reset(nullptr); - mE_imag_buf.reset(nullptr); mE_grid = p_e_grid; auto blockDim = dim3(1024); @@ -384,10 +382,6 @@ namespace ck decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - // Allocate extra space for intermediate results to bilinear op. 
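The payoff of this patch is visible in the mArgs rewrite above: each decomposed launch is elementwise in its D/E epilogue, mapping output element i to addend element i, so the Bilinear passes can name the same plane (mE_real or mE_imag) as both their D input and their E output and accumulate in place; the scratch planes mE_real_buf/mE_imag_buf and their allocations disappear. This relies on the epilogue reading the addend before writing the result for each element. In scalar form the surviving schedule is (real-valued alpha and beta, which is what the code handles at this point in the series; a minimal sketch, not part of the patch):

    #include <cassert>
    #include <cmath>

    int main()
    {
        float a_r = 2.f, a_i = 3.f, b_r = 5.f, b_i = -1.f;
        float d_r = 0.5f, d_i = -0.25f;
        float alpha = 0.75f, beta = 2.f;

        float e_r = alpha * (a_r * b_r) + beta * d_r; // mArgs[0]
        e_r = -alpha * (a_i * b_i) + 1.0f * e_r;      // mArgs[1], E aliased as its own D
        float e_i = alpha * (a_r * b_i) + beta * d_i; // mArgs[2]
        e_i = alpha * (a_i * b_r) + 1.0f * e_i;       // mArgs[3]

        // Reference: alpha * (a * b) + beta * d in complex arithmetic.
        assert(std::abs(e_r - (alpha * (a_r * b_r - a_i * b_i) + beta * d_r)) < 1e-5f);
        assert(std::abs(e_i - (alpha * (a_r * b_i + a_i * b_r) + beta * d_i)) < 1e-5f);
        return 0;
    }

The same aliasing removes the two scratch planes from the scale path in the hunk that follows.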
- mE_real_buf = std::move(allocDevice(elementsE)); - mE_imag_buf = std::move(allocDevice(elementsE)); - auto allocScaleArgs = [a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, @@ -451,19 +445,19 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real_buf, mA_real, mB_real, cde_element_op); - mScaleArgs[1] = allocScaleArgs(mE_imag_buf, mA_real, mB_imag, cde_element_op); + mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); mBilinearArgs[0] = allocBilinearArgs( mE_real, mA_imag, mB_imag, - mE_real_buf, + mE_real, BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); mBilinearArgs[1] = allocBilinearArgs( mE_imag, mA_imag, mB_real, - mE_imag_buf, + mE_imag, BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); } @@ -494,8 +488,6 @@ namespace ck DeviceArray mB_imag; DeviceArray mE_real; DeviceArray mE_imag; - DeviceArray mE_real_buf; - DeviceArray mE_imag_buf; void* mE_grid; index_t elementsE; diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index f4be1a88..cbaee86a 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,13 +1,15 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] - - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] - - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] + - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index f4be1a88..4e640034 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -8,6 +8,8 @@ Tensor Data Types: - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] + - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_R_32F ] + - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT From 1bf4a550b2c700901774e68720db66a8b47a21df Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Wed, 20 Dec 2023 12:39:08 -0500 Subject: [PATCH 33/42] Add scalar and complex type instances - Add cpu instances - Add scale complex initial imp - Remove compute f32 instances for complex f32 - Modify multiply utility - Modified bilinear to take complex 
compute --- .../contraction_cpu_reference_instances.cpp | 56 +++++++++++++++ .../src/contraction/contraction_pack_util.hpp | 50 ++++++++++++- library/src/contraction/device/CMakeLists.txt | 32 ++++----- .../device_contraction_bilinear_complex.hpp | 50 +++++++++---- ..._cf32_cf32_compute_cf32_kknn_instance.cpp} | 10 +-- ..._cf32_cf32_compute_cf32_knnn_instance.cpp} | 10 +-- ..._cf32_cf32_compute_cf32_mknn_instance.cpp} | 10 +-- ..._cf32_cf32_compute_cf32_mnnn_instance.cpp} | 10 +-- ..._cf64_cf64_compute_cf64_kknn_instance.cpp} | 10 +-- ..._cf64_cf64_compute_cf64_knnn_instance.cpp} | 10 +-- ..._cf64_cf64_compute_cf64_mknn_instance.cpp} | 16 ++--- ..._cf64_cf64_compute_cf64_mnnn_instance.cpp} | 10 +-- .../device_contraction_scale_complex.hpp | 71 +++++++++++++------ ...2_cf32_cf32_compute_cf32_kkn_instance.cpp} | 10 +-- ...2_cf32_cf32_compute_cf32_knn_instance.cpp} | 10 +-- ...2_cf32_cf32_compute_cf32_mkn_instance.cpp} | 12 ++-- ...2_cf32_cf32_compute_cf32_mnn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_kkn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_knn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_mkn_instance.cpp} | 10 +-- ...4_cf64_cf64_compute_cf64_mnn_instance.cpp} | 10 +-- 21 files changed, 290 insertions(+), 137 deletions(-) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp} (92%) rename library/src/contraction/device/{device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp => device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp => 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp} (94%) rename library/src/contraction/device/{device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp => device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp} (94%) diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 68b4ad1b..d2fd77fa 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -120,6 +120,20 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, float>()); + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + hipFloatComplex>()); + // Bilinear f64 registerSolutions( enumerateReferenceSolutions<2, @@ -164,6 +178,20 @@ namespace hiptensor ck::tensor_operation::element_wise::Bilinear, double>()); + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + hipDoubleComplex>()); + // Scale f16 registerSolutions( enumerateReferenceSolutions<2, @@ -252,6 +280,20 @@ namespace hiptensor ck::tensor_operation::element_wise::Scale, float>()); + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + hipFloatComplex>()); + // Scale f64 registerSolutions( enumerateReferenceSolutions<2, @@ -295,5 +337,19 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Scale, double>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + 
ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index f242f3ea..bcc99398 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -33,6 +33,55 @@ namespace hiptensor { + /** + * \brief This function performs multiply-accumulate of the form E = accum * alpha + D * beta + * + */ + template + __global__ void mfma(DataType* mE_real, DataType* mE_imag, DataType* mD_real, DataType* mD_imag, + HIP_vector_type *mE_grid, HIP_vector_type alpha, + HIP_vector_type beta, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCaddf(hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha), + hipCmulf(make_hipFloatComplex(mD_real[idx], mD_imag[idx]), beta)); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCadd(hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha), + hipCmul(make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), beta)); + } + } + } + + /** + * \brief This function performs multiply of the form C = accum * alpha + * + */ + template + __global__ void multiply(DataType* mE_real, DataType* mE_imag, HIP_vector_type *mE_grid, + HIP_vector_type alpha, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha); + } + } + } + /** * \brief This function unpacks structured data (hipFloatComplex / hipDoubleComplex) * into non-structured data (float / double). 
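For orientation, here is a minimal host-side sketch of the per-element arithmetic the mfma kernel above performs, assuming only the hip_complex.h helpers already used in the patch (the function name mfmaHostReference is illustrative, not part of this change):

#include <hip/hip_complex.h>
#include <vector>

// Host reference for the mfma kernel: for every element,
// E[i] = alpha * (E_real[i] + i*E_imag[i]) + beta * (D_real[i] + i*D_imag[i]).
void mfmaHostReference(std::vector<float> const& eReal,
                       std::vector<float> const& eImag,
                       std::vector<float> const& dReal,
                       std::vector<float> const& dImag,
                       std::vector<hipFloatComplex>& eGrid,
                       hipFloatComplex alpha,
                       hipFloatComplex beta)
{
    for(size_t i = 0; i < eGrid.size(); i++)
    {
        // Re-pack the decomposed real / imaginary planes, then scale.
        auto accum = make_hipFloatComplex(eReal[i], eImag[i]);
        auto d     = make_hipFloatComplex(dReal[i], dImag[i]);
        eGrid[i]   = hipCaddf(hipCmulf(accum, alpha), hipCmulf(d, beta));
    }
}

The multiply kernel is the same idea without the D term: E[i] = alpha * (E_real[i] + i*E_imag[i]).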
@@ -98,4 +147,3 @@ namespace hiptensor } // namespace hiptensor #endif // HIPTENSOR_CONTRACTION_PACK_UTIL_HPP - diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index 3ac03149..b65a8ab1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -29,14 +29,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -65,14 +65,14 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 2df240c4..c7a71263 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -152,7 +152,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + HIP_vector_type, LoopSched> : public DeviceContractionMultipleD; - using ComplexB = HIP_vector_type; - using ComplexDs = HIP_vector_type; - using ComplexE = HIP_vector_type; + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + using ComplexCompute = HIP_vector_type; // Internal functional types we will use to // decompose the complex types and operate on. 
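For reference, the decomposition these internal Decomp types enable follows the usual complex product identity (Ar + i*Ai)(Br + i*Bi) = (Ar*Br - Ai*Bi) + i*(Ar*Bi + Ai*Br). Below is a per-element sketch of the four bilinear sub-contractions set up by mArgs[0..3] further down, in this work-in-progress state where alpha and beta are deferred to the mfma pack step (Planes and bilinearDecomposed are illustrative names, not patch code):

struct Planes
{
    float re;
    float im;
};

// Per-element view of the four decomposed real-plane contractions;
// each statement mirrors one mArgs launch below.
Planes bilinearDecomposed(Planes a, Planes b, Planes d)
{
    Planes e;
    e.re = a.re * b.re + d.re;  // mArgs[0]: E_real  =  Ar*Br + D_real
    e.re = -a.im * b.im + e.re; // mArgs[1]: E_real += -Ai*Bi
    e.im = a.re * b.im + d.im;  // mArgs[2]: E_imag  =  Ar*Bi + D_imag
    e.im = a.im * b.re + e.im;  // mArgs[3]: E_imag +=  Ai*Br
    return e;
}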
- using DecompA = ADataType; - using DecompB = BDataType; - using DecompDs = DsDataType; - using DecompE = EDataType; + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = DsDataType; + using DecompE = EDataType; + using DecompCompute = ComputeDataType; // For complex types, we need to make sure that all of the types are the same static_assert(std::is_same_v && std::is_same_v && std::is_same_v - && std::is_same_v - && std::is_same_v, + && std::is_same_v + && std::is_same_v, "Complex operations must have the same data type"); static_assert(std::is_same_v || std::is_same_v, @@ -297,6 +299,8 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + element_op = cde_element_op; + mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -367,6 +371,19 @@ namespace ck cde_element_op); }; + mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[1] = allocArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + CDEElementwiseOperation{-1.0f, + 1.0f}); + mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, + CDEElementwiseOperation{1.0f , 1.0f}); + + // original + /* TODO :Uncomment once done mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op); mArgs[1] = allocArgs(mE_real, mA_imag, @@ -376,7 +393,7 @@ namespace ck 1.0f}); mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op); mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f}); + CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f});*/ } void Print() const @@ -408,6 +425,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; + CDEElementwiseOperation element_op{1.0f, 1.0f}; void* mE_grid; index_t elementsE; }; @@ -448,8 +466,12 @@ namespace ck { auto blockDim = dim3(1024); auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::pack<<>>( - arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); + hiptensor::mfma<<>>( + arg.mE_real.get(), arg.mE_imag.get(), arg.mD_real.get(), arg.mD_imag.get(), + ((ComplexE*)arg.mE_grid), arg.element_op.alpha_, arg.element_op.beta_, + arg.elementsE); + //hiptensor::pack<<>>( + // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp index 03514f47..02e3834e 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp @@ -53,20 +53,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E - using 
device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance = device_contraction_kk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp index bb1ccde5..742d49a6 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance = device_contraction_kn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp index 2d47acc0..0f6b19d1 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E - 
using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance = device_contraction_mk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp index 4c881c0a..184aea57 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance = device_contraction_mn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp index ed2ba843..5be10230 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp @@ -53,20 +53,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for 
A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance = device_contraction_f64_kk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp index 03dd9293..bf5c1667 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance = device_contraction_f64_kn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp similarity index 92% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp index c44a5daf..e07e603e 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp @@ -52,35 +52,35 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing 
dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance = device_contraction_f64_mk_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( std::vector>>& instances) + PassThrough, + Bilinear, + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp rename to library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp index d045a404..3329307a 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance = device_contraction_f64_mn_instance; void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance{}); + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index f3a7fd2e..43a9358c 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -43,8 +43,8 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - using Scale = ck::tensor_operation::element_wise::Scale; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using Scale = ck::tensor_operation::element_wise::Scale; // The following is a specialization class for bilinear contractions of complex types. 
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -152,7 +152,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + HIP_vector_type, LoopSched> : public DeviceContractionMultipleD + HIP_vector_type> { // Complex device Op using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; @@ -175,22 +175,24 @@ namespace ck using BilinearCDEElementwiseOperation = Bilinear; // Complex types given through the interface - using ComplexA = HIP_vector_type; - using ComplexB = HIP_vector_type; - using ComplexDs = HIP_vector_type; - using ComplexE = HIP_vector_type; + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + using ComplexCompute = HIP_vector_type; // Internal functional types we will use to // decompose the complex types and operate on. - using DecompA = ADataType; - using DecompB = BDataType; - using DecompDs = EDataType; - using DecompE = EDataType; + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = EDataType; + using DecompE = EDataType; + using DecompCompute = ComputeDataType; // For complex types, we need to make sure that all of the types are the same static_assert(std::is_same_v && std::is_same_v - && std::is_same_v - && std::is_same_v, + && std::is_same_v + && std::is_same_v, "Complex operations must have the same data type"); static_assert(std::is_same_v || std::is_same_v, @@ -243,7 +245,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + DecompCompute, LoopSched>; // The internal operation that we will decompose the complex operations with. 
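The scale specialization relies on the same identity, but with no D tensor to fold in: each output plane is seeded by a Scale contraction and the cross term is accumulated by a Bilinear one, with the complex alpha applied afterwards by the multiply kernel. A per-element sketch mirroring the mScaleArgs / mBilinearArgs launches set up further down (scaleDecomposed is an illustrative name, not patch code):

// Two Scale + two Bilinear sub-contractions, per element.
void scaleDecomposed(float ar, float ai, float br, float bi,
                     float& eRe, float& eIm)
{
    eRe = ar * br;        // mScaleArgs[0]:    E_real  =  Ar*Br
    eRe = -ai * bi + eRe; // mBilinearArgs[0]: E_real += -Ai*Bi
    eIm = ar * bi;        // mScaleArgs[1]:    E_imag  =  Ar*Bi
    eIm = ai * br + eIm;  // mBilinearArgs[1]: E_imag +=  Ai*Br
}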
@@ -291,7 +293,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + DecompCompute, LoopSched>; // Argument @@ -314,8 +316,8 @@ namespace ck { mScaleArgs[0] = std::move(other.mScaleArgs[0]); mScaleArgs[1] = std::move(other.mScaleArgs[1]); - mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); - mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); } return *this; } @@ -346,6 +348,8 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + element_op = cde_element_op; + mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -445,7 +449,26 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, ScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] = allocBilinearArgs( + mE_real, + mA_imag, + mB_imag, + mE_real, + BilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, ScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] = allocBilinearArgs( + mE_imag, + mA_imag, + mB_real, + mE_imag, + BilinearCDEElementwiseOperation{1.0f, 1.0f}); + + + // TODO UNCOMMENT WHEN DONE + // original + /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); mBilinearArgs[0] = allocBilinearArgs( mE_real, @@ -458,7 +481,7 @@ namespace ck mA_imag, mB_real, mE_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f}); + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const @@ -489,6 +512,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; + ScaleCDEElementwiseOperation element_op{1.0}; void* mE_grid; index_t elementsE; }; @@ -532,8 +556,11 @@ namespace ck { auto blockDim = dim3(1024); auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::pack<<>>( - arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); + + hiptensor::multiply<<>>( + arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.element_op.scale_, arg.elementsE); + //hiptensor::pack<<>>( + // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp index 3352556d..9e9c8f9a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp @@ -54,19 +54,19 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E - using 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance = device_contraction_kk_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp index cfd6c7f4..b9183a21 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance = device_contraction_kn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp index eacc1148..1f87031d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance = device_contraction_mk_instance; void 
- add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp index b5e79372..ef7724e0 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance = device_contraction_mn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( std::vector>>& instances) + CF32>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp index c0934498..e22aab5f 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp @@ -54,20 +54,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance = device_contraction_f64_kk_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( std::vector>>& instances) + 
CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp index 8514cb70..58ed790a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance = device_contraction_f64_kn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp index 09d589d6..562519f5 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance = device_contraction_f64_mk_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance{}); + 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance{}); } } // namespace instance diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp similarity index 94% rename from library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp rename to library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp index 6b90050b..724d89cf 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp @@ -52,20 +52,20 @@ namespace ck // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E - using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance = device_contraction_f64_mn_instance; void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( std::vector>>& instances) + CF64>>>& instances) { add_device_operation_instances( instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance{}); + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance{}); } } // namespace instance } // namespace device From b50a1597edc306a24dd7580ca83f9d4e9f45836e Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Thu, 21 Dec 2023 12:20:48 -0500 Subject: [PATCH 34/42] Add complex bilinear and scale structures --- .../device/device_element_wise_complex.hpp | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 library/src/contraction/device/device_element_wise_complex.hpp diff --git a/library/src/contraction/device/device_element_wise_complex.hpp b/library/src/contraction/device/device_element_wise_complex.hpp new file mode 100644 index 00000000..6dfd94e0 --- /dev/null +++ b/library/src/contraction/device/device_element_wise_complex.hpp @@ -0,0 +1,108 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP +#define HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP + +#include +#include +#include + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct ScaleComplex : public Scale +{ + __host__ __device__ ScaleComplex(hipFloatComplex scale) : Scale(hipCrealf(scale)) + { + scale_ = hipComplexFloatToDouble(scale); + } + + __host__ __device__ ScaleComplex(hipDoubleComplex scale) : Scale(hipCreal(scale)) + { + scale_ = scale; + } + + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(hipFloatComplex& y, const hipFloatComplex& x) const + { + y = hipCmulf(hipComplexDoubleToFloat(scale_), x); + }; + + template <> + __host__ __device__ void operator()(hipDoubleComplex& y, const hipDoubleComplex& x) const + { + y = hipCmul(scale_, x); + }; + + // complex * float + hipDoubleComplex scale_; +}; + +struct BilinearComplex : public Bilinear +{ + BilinearComplex(hipFloatComplex alpha, hipFloatComplex beta) : Bilinear(hipCrealf(alpha), hipCrealf(beta)) + { + alpha_ = hipComplexFloatToDouble(alpha); + beta_ = hipComplexFloatToDouble(beta); + } + + BilinearComplex(hipDoubleComplex alpha, hipDoubleComplex beta) : Bilinear(hipCreal(alpha), hipCreal(beta)) + { + alpha_ = alpha; + beta_ = beta; + } + + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const; + + template <> + __host__ __device__ constexpr void + operator()(hipDoubleComplex& y, const hipDoubleComplex& x0, const hipDoubleComplex& x1) const + { + y = hipCadd(hipCmul(alpha_, x0), hipCmul(beta_, x1)); + }; + + template <> + __host__ __device__ constexpr void + operator()(hipFloatComplex& y, const hipFloatComplex& x0, const hipFloatComplex& x1) const + { + y = hipCaddf(hipCmulf(hipComplexDoubleToFloat(alpha_), x0), hipCmulf(hipComplexDoubleToFloat(beta_), x1)); + }; + + hipDoubleComplex alpha_; + hipDoubleComplex beta_; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP From b85aa93109b173a3066841eeb0d1d426b7fec539 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 21 Dec 2023 18:28:58 +0000 Subject: [PATCH 35/42] Support complex alpha and beta in contraction --- library/include/hiptensor/hiptensor_types.hpp | 2 + .../src/contraction/contraction_selection.cpp | 21 +++- .../contraction/contraction_solution_impl.hpp | 16 +-- library/src/data_types.cpp | 98 ++++++++++++++++++- library/src/include/data_types.hpp | 45 +++++++++ library/src/include/data_types_impl.hpp | 6 +- test/01_contraction/CMakeLists.txt | 6 ++ .../complex_bilinear_contraction_test.cpp | 48 +++++++++ .../configs/complex_bilinear_test_params.yaml | 37 +++++++ 9 files changed, 263 insertions(+), 16 deletions(-) create mode 100644 test/01_contraction/complex_bilinear_contraction_test.cpp create mode 100644 test/01_contraction/configs/complex_bilinear_test_params.yaml diff --git a/library/include/hiptensor/hiptensor_types.hpp b/library/include/hiptensor/hiptensor_types.hpp index 85a5d90e..ca666a5b 100644 --- a/library/include/hiptensor/hiptensor_types.hpp +++ 
b/library/include/hiptensor/hiptensor_types.hpp @@ -90,6 +90,8 @@ typedef enum HIPTENSOR_COMPUTE_8I = (1U << 8U), HIPTENSOR_COMPUTE_32U = (1U << 7U), HIPTENSOR_COMPUTE_32I = (1U << 9U), + HIPTENSOR_COMPUTE_C32F = (1U << 11U), + HIPTENSOR_COMPUTE_C64F = (1U << 12U), HIPTENSOR_COMPUTE_NONE = 0 } hiptensorComputeType_t; diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 9b0cdf9f..b2e54d80 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -81,10 +81,23 @@ namespace hiptensor * ``` * Hence, the `alpha` and `bete` need to point to a ComputeData value */ - double alpha = 0.0; - double beta = 0.0; - writeVal(&alpha, computeType, 1.02); - writeVal(&beta, computeType, 1.03); + hipDoubleComplex alpha; + hipDoubleComplex beta; + if(computeType == HIPTENSOR_COMPUTE_C32F) + { + writeVal(&alpha, computeType, hipFloatComplex{1.02, 1.03}); + writeVal(&beta, computeType, hipFloatComplex{1.04, 1.05}); + } + else if(computeType == HIPTENSOR_COMPUTE_C64F) + { + writeVal(&alpha, computeType, hipDoubleComplex{1.02, 1.03}); + writeVal(&beta, computeType, hipDoubleComplex{1.04, 1.05}); + } + else + { + writeVal(&alpha, computeType, 1.02); + writeVal(&beta, computeType, 1.03); + } CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB)); diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 263937c3..33b6a85e 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -90,17 +90,17 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; - auto betaF = 0.0f; + ScalarData alphaF; + ScalarData betaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal( + alphaF = hiptensor::readVal( alpha, convertToComputeType(HipDataType_v)); } if(beta != nullptr) { - betaF = hiptensor::readVal( + betaF = hiptensor::readVal( beta, convertToComputeType(HipDataType_v)); } @@ -125,7 +125,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF, betaF})); + typename Traits::CDEOp(alphaF, betaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); @@ -203,11 +203,11 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; + ScalarData alphaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal( + alphaF = hiptensor::readVal( alpha, convertToComputeType(HipDataType_v)); } @@ -232,7 +232,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF})); + typename Traits::CDEOp(alphaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index 09df158f..abaf7154 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -110,11 +110,11 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_16F; } - else if(hipType == HIP_R_32F || hipType == HIP_C_32F) + else if(hipType == HIP_R_32F) { return HIPTENSOR_COMPUTE_32F; } - else if(hipType == HIP_R_64F || hipType == HIP_C_64F) + else if(hipType == 
HIP_R_64F) { return HIPTENSOR_COMPUTE_64F; } @@ -134,12 +134,72 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_32U; } + else if(hipType == HIP_C_32F) + { + return HIPTENSOR_COMPUTE_C32F; + } + else if(hipType == HIP_C_64F) + { + return HIPTENSOR_COMPUTE_C64F; + } else { return HIPTENSOR_COMPUTE_NONE; } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + return ScalarData(*(_Float16*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + return ScalarData(*(hip_bfloat16*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + return ScalarData(*(float*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + return ScalarData(*(double*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + return ScalarData(*(uint8_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + return ScalarData(*(int8_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + return ScalarData(*(uint32_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + return ScalarData(*(int32_t*)value, id); + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + return {*(hipFloatComplex*)value, id}; + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + return {*(hipDoubleComplex*)value, id}; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return {0, HIPTENSOR_COMPUTE_NONE}; + } + } + void writeVal(void const* addr, hiptensorComputeType_t id, double value) { if(id == HIPTENSOR_COMPUTE_16F) @@ -183,6 +243,40 @@ namespace hiptensor } } + void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value) + { + if(id == HIPTENSOR_COMPUTE_C32F) + { + *(hipFloatComplex*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Data type is hipFloatComplex, but hiptensorComputeType_t is not " + "HIPTENSOR_COMPUTE_C32F: " + << id << std::endl; +#endif // !NDEBUG + return; + } + } + + void writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value) + { + if(id == HIPTENSOR_COMPUTE_C64F) + { + *(hipDoubleComplex*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Data type is hipDoubleComplex, but hiptensorComputeType_t is not " + "HIPTENSOR_COMPUTE_C64F: " + << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 97402fa3..aa2eaa40 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -44,6 +44,49 @@ namespace hiptensor // Used to map to empty tensors struct NoneType; + struct ScalarData + { + hiptensorComputeType_t type; + union + { + double real; + hipDoubleComplex complex; + }; + + ScalarData() = default; + ScalarData(double value, hiptensorComputeType_t type) + : real(value) + , type(type) + { + } + ScalarData(hipFloatComplex value, hiptensorComputeType_t type) + : complex(hipComplexFloatToDouble(value)) + , type(type) + { + } + ScalarData(hipDoubleComplex value, hiptensorComputeType_t type) + : complex(value) + , type(type) + { + } + operator float() const + { + return static_cast(real); + } + operator double() const + { + return real; + } + operator hipFloatComplex() const + { + return hipComplexDoubleToFloat(complex); + } + operator hipDoubleComplex() const + { + return complex; + } + }; + static constexpr hipDataType NONE_TYPE = (hipDataType)31; // Map type to runtime HipDataType 
@@ -67,6 +110,8 @@ namespace hiptensor T readVal(void const* value, hiptensorComputeType_t id); void writeVal(void const* addr, hiptensorComputeType_t id, double value); + void writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value); + void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value); } // namespace hiptensor diff --git a/library/src/include/data_types_impl.hpp b/library/src/include/data_types_impl.hpp index ef3e7c77..c55f0d7e 100644 --- a/library/src/include/data_types_impl.hpp +++ b/library/src/include/data_types_impl.hpp @@ -174,11 +174,11 @@ namespace hiptensor { return static_cast(*(uint64_t*)value); } - else if constexpr(std::is_same_v && id == HIP_C_32F) + else if constexpr(std::is_same_v && id == HIP_C_32F) { return static_cast(*(hipFloatComplex*)value); } - else if constexpr(std::is_same_v && id == HIP_C_64F) + else if constexpr(std::is_same_v && id == HIP_C_64F) { return static_cast(*(hipDoubleComplex*)value); } @@ -235,6 +235,8 @@ namespace hiptensor } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id); } // namespace hiptensor #endif // HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP diff --git a/test/01_contraction/CMakeLists.txt b/test/01_contraction/CMakeLists.txt index fe2d7a87..1e0e3c0a 100644 --- a/test/01_contraction/CMakeLists.txt +++ b/test/01_contraction/CMakeLists.txt @@ -33,6 +33,12 @@ set (BilinearContractionTestSources ${ContractionCommonSources} set (BilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/bilinear_test_params.yaml) add_hiptensor_test(bilinear_contraction_test ${BilinearContractionTestConfig} ${BilinearContractionTestSources}) +# Complex Bilinear tests +set (ComplexBilinearContractionTestSources ${ContractionCommonSources} + ${CMAKE_CURRENT_SOURCE_DIR}/complex_bilinear_contraction_test.cpp) +set (ComplexBilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/complex_bilinear_test_params.yaml) +add_hiptensor_test(complex_bilinear_contraction_test ${ComplexBilinearContractionTestConfig} ${ComplexBilinearContractionTestSources}) + # Scale tests set (ScaleContractionTestSources ${ContractionCommonSources} ${CMAKE_CURRENT_SOURCE_DIR}/scale_contraction_test.cpp) diff --git a/test/01_contraction/complex_bilinear_contraction_test.cpp b/test/01_contraction/complex_bilinear_contraction_test.cpp new file mode 100644 index 00000000..51e95c34 --- /dev/null +++ b/test/01_contraction/complex_bilinear_contraction_test.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexBilinearContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexBilinearContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexBilinearContractionTest, load_config_helper()); diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml new file mode 100644 index 00000000..cbaee86a --- /dev/null +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -0,0 +1,37 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + # - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY +Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - 0 + - 1 + - 1 +Betas: + - 2 + - 0 + - 2 +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 2 ] +Strides: + - [] +... 
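For reference, the operation exercised by the complex bilinear tests added above
is D = alpha * contraction(A, B) + beta * C with complex-valued alpha and beta.
Below is a minimal host-side sketch of the per-element epilogue arithmetic,
assuming only the helpers from <hip/hip_complex.h>; the function name is
illustrative, not a library API. The packing kernels introduced in the
following patches perform the same arithmetic on the GPU.

    #include <hip/hip_complex.h>

    // e = alpha * accum + beta * d for the f32 complex path; alpha and beta
    // are carried in double precision and narrowed at the point of use.
    static hipFloatComplex bilinearEpilogueF32(hipFloatComplex accum,
                                               hipFloatComplex d,
                                               hipDoubleComplex alpha,
                                               hipDoubleComplex beta)
    {
        return hipCaddf(hipCmulf(hipComplexDoubleToFloat(alpha), accum),
                        hipCmulf(hipComplexDoubleToFloat(beta), d));
    }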
From 1b2031ecfdfb6e614c081853d86c3b070a21fbf3 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Thu, 21 Dec 2023 13:40:44 -0500 Subject: [PATCH 36/42] Modify base class compute type --- .../device/device_contraction_bilinear_complex.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index c7a71263..fc050e50 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -165,7 +165,7 @@ namespace ck AElementwiseOperation, BElementwiseOperation, Bilinear, - ComputeDataType> + HIP_vector_type> { // Complex device Op using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; @@ -243,7 +243,7 @@ namespace ck CShuffleNXdlPerWavePerShuffle, CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEBlockTransferScalarPerVector_NPerBlock, - ComputeDataType, + DecompCompute, LoopSched>; // Argument From 41c44b26d3d9e4cfb17402418f4a88bba6338fef Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Thu, 21 Dec 2023 14:13:12 -0500 Subject: [PATCH 37/42] Modify cpu and gpu device instances for complex --- .../contraction_cpu_reference_impl.hpp | 38 +++++++++- .../contraction_cpu_reference_instances.cpp | 64 +--------------- .../contraction/contraction_meta_traits.hpp | 24 ++++-- .../src/contraction/contraction_pack_util.hpp | 31 +++++--- .../src/contraction/contraction_solution.hpp | 2 + .../contraction/contraction_solution_impl.hpp | 12 ++- .../contraction_solution_instances.cpp | 16 ++-- library/src/contraction/contraction_types.hpp | 2 + .../contraction/contraction_types_impl.hpp | 13 ++++ library/src/contraction/device/common.hpp | 2 + .../device_contraction_bilinear_complex.hpp | 30 ++++---- ...2_cf32_cf32_compute_cf32_kknn_instance.cpp | 11 +-- ...2_cf32_cf32_compute_cf32_knnn_instance.cpp | 11 +-- ...2_cf32_cf32_compute_cf32_mknn_instance.cpp | 11 +-- ...2_cf32_cf32_compute_cf32_mnnn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_kknn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_knnn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_mknn_instance.cpp | 11 +-- ...4_cf64_cf64_compute_cf64_mnnn_instance.cpp | 11 +-- .../device_contraction_scale_complex.hpp | 30 ++++---- ...32_cf32_cf32_compute_cf32_kkn_instance.cpp | 11 +-- ...32_cf32_cf32_compute_cf32_knn_instance.cpp | 11 +-- ...32_cf32_cf32_compute_cf32_mkn_instance.cpp | 11 +-- ...32_cf32_cf32_compute_cf32_mnn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_kkn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_knn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_mkn_instance.cpp | 11 +-- ...64_cf64_cf64_compute_cf64_mnn_instance.cpp | 11 +-- ...device_element_wise_operation_complex.hpp} | 17 +---- ...ptensor_contraction_bilinear_instances.hpp | 76 ++++++++++--------- .../hiptensor_contraction_scale_instances.hpp | 74 +++++++++--------- library/src/include/meta_traits.hpp | 2 +- 32 files changed, 323 insertions(+), 286 deletions(-) rename library/src/contraction/device/{device_element_wise_complex.hpp => device_element_wise_operation_complex.hpp} (87%) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 25c317e3..2f031bb0 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -200,7 
+200,20 @@ namespace hiptensor { ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.scale_ * (EDataType)accum; } - else // bilinear + else if constexpr(std::is_same_v) + { + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCmulf(hipComplexDoubleToFloat(arg.mOpCDE.scale_), (EDataType)accum); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCmul(arg.mOpCDE.scale_, (EDataType)accum); + } + } + else if constexpr(std::is_same_v) { // NumDTensor will be 1 due to SFINAE of this class auto indexD @@ -209,6 +222,29 @@ namespace hiptensor ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.alpha_ * (EDataType)accum + arg.mOpCDE.beta_ * ((EDataType*)(arg.mD[0]))[indexD]; } + else if constexpr(std::is_same_v) + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCaddf( + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.alpha_), + (EDataType)accum), + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.beta_), + ((EDataType*)(arg.mD[0]))[indexD])); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCadd(hipCmul(arg.mOpCDE.alpha_, (EDataType)accum), + hipCmul(arg.mOpCDE.beta_, ((EDataType*)(arg.mD[0]))[indexD])); + } + } }; make_ParallelTensorFunctor(f_ms_ns_complex, diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index d2fd77fa..60c1ce49 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -117,21 +117,7 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - float>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipFloatComplex, - hipFloatComplex, - float, - ck::Tuple, - hipFloatComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, + ck::tensor_operation::element_wise::BilinearComplex, hipFloatComplex>()); // Bilinear f64 @@ -175,21 +161,7 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - double>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipDoubleComplex, - hipDoubleComplex, - double, - ck::Tuple, - hipDoubleComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, + ck::tensor_operation::element_wise::BilinearComplex, hipDoubleComplex>()); // Scale f16 @@ -277,21 +249,7 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - float>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipFloatComplex, - hipFloatComplex, - float, - ck::Tuple<>, - hipFloatComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, + ck::tensor_operation::element_wise::ScaleComplex, hipFloatComplex>()); // Scale f64 @@ -335,21 +293,7 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, 
ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - double>()); - - registerSolutions( - enumerateReferenceSolutions<2, - 2, - 2, - hipDoubleComplex, - hipDoubleComplex, - double, - ck::Tuple<>, - hipDoubleComplex, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, + ck::tensor_operation::element_wise::ScaleComplex, hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index e66ac432..48508c6e 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -34,12 +34,12 @@ #include // hiptensor includes +#include "device/device_element_wise_operation_complex.hpp" #include "data_types.hpp" #include "meta_traits.hpp" namespace hiptensor { - // Partial specialize for Bilinear contraction template struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>, + std::enable_if_t<(std::is_same_v) || + (std::is_same_v)>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; @@ -88,7 +93,7 @@ namespace hiptensor ComputeDataType>; using AOp = AElementwiseOperation; using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using CDEOp = CDEElementwiseOperation; }; // Partial specialize for Scale contraction @@ -100,6 +105,7 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, + typename CDEElementwiseOperation, typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>, + std::enable_if_t<(std::is_same_v) || + (std::is_same_v)>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; @@ -129,7 +139,7 @@ namespace hiptensor ComputeDataType>; using AOp = AElementwiseOperation; using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using CDEOp = CDEElementwiseOperation; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index bcc99398..237e9d7f 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -39,8 +39,8 @@ namespace hiptensor */ template __global__ void mfma(DataType* mE_real, DataType* mE_imag, DataType* mD_real, DataType* mD_imag, - HIP_vector_type *mE_grid, HIP_vector_type alpha, - HIP_vector_type beta, int length) + HIP_vector_type *mE_grid, HIP_vector_type alpha, + HIP_vector_type beta, int length) { int idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -48,13 +48,22 @@ namespace hiptensor { if constexpr(std::is_same_v) { - mE_grid[idx] = hipCaddf(hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha), - hipCmulf(make_hipFloatComplex(mD_real[idx], mD_imag[idx]), beta)); + mE_grid[idx] = hipCaddf( + hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)), + hipCmulf( + make_hipFloatComplex(mD_real[idx], mD_imag[idx]), + hipComplexDoubleToFloat(beta))); } else if constexpr(std::is_same_v) { - mE_grid[idx] = hipCadd(hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha), - hipCmul(make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), beta)); + mE_grid[idx] = hipCadd(hipCmul( + make_hipDoubleComplex(mE_real[idx], 
mE_imag[idx]), + alpha), + hipCmul( + make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), + beta)); } } } @@ -65,7 +74,7 @@ namespace hiptensor */ template __global__ void multiply(DataType* mE_real, DataType* mE_imag, HIP_vector_type *mE_grid, - HIP_vector_type alpha, int length) + HIP_vector_type alpha, int length) { int idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -73,11 +82,15 @@ namespace hiptensor { if constexpr(std::is_same_v) { - mE_grid[idx] = hipCmulf(make_hipFloatComplex(mE_real[idx], mE_imag[idx]), alpha); + mE_grid[idx] = hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)); } else if constexpr(std::is_same_v) { - mE_grid[idx] = hipCmul(make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), alpha); + mE_grid[idx] = hipCmul( + make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), + alpha); } } } diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index e76bb351..97dde1ca 100644 --- a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -38,6 +38,8 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" + #include "contraction_meta_traits.hpp" #include "contraction_solution_params.hpp" #include "performance.hpp" diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 33b6a85e..09e300a7 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -52,8 +52,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Bilinear>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Bilinear>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::BilinearComplex>)>> : public ContractionSolution { public: @@ -165,8 +167,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Scale>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Scale>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::ScaleComplex>)>> : public ContractionSolution { public: diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 2cec41bc..ad5b4408 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -116,8 +116,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - float>()); + ck::tensor_operation::element_wise::BilinearComplex, + hipFloatComplex>()); // Bilinear f64 registerSolutions( @@ -156,8 +156,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - double>()); + ck::tensor_operation::element_wise::BilinearComplex, + hipDoubleComplex>()); // Scale bf16 registerSolutions( @@ -238,8 +238,8 @@ namespace hiptensor hipFloatComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - float>()); + 
ck::tensor_operation::element_wise::ScaleComplex, + hipFloatComplex>()); // Scale f64 registerSolutions( @@ -278,8 +278,8 @@ namespace hiptensor hipDoubleComplex, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - double>()); + ck::tensor_operation::element_wise::ScaleComplex, + hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_types.hpp b/library/src/contraction/contraction_types.hpp index 101d72dc..e4930726 100644 --- a/library/src/contraction/contraction_types.hpp +++ b/library/src/contraction/contraction_types.hpp @@ -40,6 +40,8 @@ namespace hiptensor { SCALE = 0, ///< \f${C=\alpha\mathcal{A}\mathcal{B}}\f$ BILINEAR = 1, ///< \f${D=\alpha\mathcal{A}\mathcal{B}+\beta\mathcal{C}}\f$ + SCALE_COMPLEX = 2, + BILINEAR_COMPLEX = 3, UNKNOWN, }; diff --git a/library/src/contraction/contraction_types_impl.hpp b/library/src/contraction/contraction_types_impl.hpp index d8fa0f74..070718cc 100644 --- a/library/src/contraction/contraction_types_impl.hpp +++ b/library/src/contraction/contraction_types_impl.hpp @@ -32,6 +32,7 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" #include "contraction_types.hpp" #include @@ -51,12 +52,24 @@ namespace hiptensor static constexpr auto value = ContractionOpId_t::SCALE; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::SCALE_COMPLEX; + }; + template <> struct ContractionOperatorType { static constexpr auto value = ContractionOpId_t::BILINEAR; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::BILINEAR_COMPLEX; + }; + } // namespace hiptensor #endif // HIPTENSOR_CONTRACTION_TYPES_IMPL_HPP diff --git a/library/src/contraction/device/common.hpp b/library/src/contraction/device/common.hpp index f530b2e2..efd4866c 100644 --- a/library/src/contraction/device/common.hpp +++ b/library/src/contraction/device/common.hpp @@ -39,4 +39,6 @@ #include #include +#include "device_element_wise_operation_complex.hpp" + #endif // CONTRACTION_DEVICE_COMMON_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index fc050e50..7fc09504 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -43,7 +43,8 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; // The following is a specialization class for bilinear contractions of complex types. 
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -121,7 +122,7 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Bilinear, + BilinearComplex, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -164,12 +165,13 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Bilinear, + BilinearComplex, HIP_vector_type> { // Complex device Op - using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; - using CDEElementwiseOperation = Bilinear; + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + using CDEElementwiseOperation = BilinearComplex; + using DecompCDEElementwiseOperation = Bilinear; // Complex types given through the interface using ComplexA = HIP_vector_type; @@ -212,7 +214,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation, + DecompCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -285,7 +287,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) + CDEElementwiseOperation cde_element_op) : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. @@ -299,8 +301,6 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); - element_op = cde_element_op; - mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -371,16 +371,16 @@ namespace ck cde_element_op); }; - mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, DecompCDEElementwiseOperation{1.0f, 1.0f}); mArgs[1] = allocArgs(mE_real, mA_imag, mB_imag, mE_real, - CDEElementwiseOperation{-1.0f, + DecompCDEElementwiseOperation{-1.0f, 1.0f}); - mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, CDEElementwiseOperation{1.0f, 1.0f}); + mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, DecompCDEElementwiseOperation{1.0f, 1.0f}); mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - CDEElementwiseOperation{1.0f , 1.0f}); + DecompCDEElementwiseOperation{1.0f , 1.0f}); // original /* TODO :Uncomment once done @@ -425,7 +425,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; - CDEElementwiseOperation element_op{1.0f, 1.0f}; + CDEElementwiseOperation element_op; void* mE_grid; index_t elementsE; }; @@ -469,7 +469,7 @@ namespace ck hiptensor::mfma<<>>( arg.mE_real.get(), arg.mE_imag.get(), arg.mD_real.get(), arg.mD_imag.get(), ((ComplexE*)arg.mE_grid), arg.element_op.alpha_, arg.element_op.beta_, - arg.elementsE); + arg.elementsE); //hiptensor::pack<<>>( // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp index 02e3834e..4601021e 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp @@ -47,9 +47,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; 
+ using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -63,7 +64,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( @@ -76,7 +77,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp index 742d49a6..e3f60146 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp index 0f6b19d1..c2fd7c84 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp index 184aea57..8203a4e5 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using CF32_Tuple = ck::Tuple; + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp index 5be10230..9d779671 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp @@ -47,9 +47,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -63,7 +64,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( @@ -76,7 +77,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp index bf5c1667..4197dda2 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are 
the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp index e07e603e..cc519368 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp index 3329307a..ff187398 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using CF64_Tuple = ck::Tuple; + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear>; + BilinearComplex>; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Bilinear, + BilinearComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 43a9358c..47b84e2c 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -44,7 +44,9 @@ namespace ck using hiptensor::elementSpaceFromLengthsAndStrides; using Bilinear = 
ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; using Scale = ck::tensor_operation::element_wise::Scale; + using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; // The following is a specialization class for bilinear contractions of complex types. // For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -121,7 +123,7 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Scale, + ScaleComplex, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -164,15 +166,17 @@ namespace ck HIP_vector_type, AElementwiseOperation, BElementwiseOperation, - Scale, + ScaleComplex, HIP_vector_type> { // Complex device Op using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; // CDE Operations - using ScaleCDEElementwiseOperation = Scale; - using BilinearCDEElementwiseOperation = Bilinear; + using ScaleCDEElementwiseOperation = ScaleComplex; + using DecompScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = BilinearComplex; + using DecompBilinearCDEElementwiseOperation = Bilinear; // Complex types given through the interface using ComplexA = HIP_vector_type; @@ -214,7 +218,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - ScaleCDEElementwiseOperation, + DecompScaleCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -262,7 +266,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - BilinearCDEElementwiseOperation, + DecompBilinearCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -336,7 +340,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - ScaleCDEElementwiseOperation cde_element_op) + ScaleCDEElementwiseOperation cde_element_op) : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. 
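// With A = Ar + i*Ai and B = Br + i*Bi, the sub-contraction arguments
// allocated below compute the complex product in real arithmetic:
//     Re(E) = Ar*Br - Ai*Bi   (mScaleArgs[0], then mBilinearArgs[0] with {-1.0f, 1.0f})
//     Im(E) = Ar*Bi + Ai*Br   (mScaleArgs[1], then mBilinearArgs[1] with { 1.0f, 1.0f})
// The complex alpha held in element_op is applied afterwards by the
// multiply() kernel from contraction_pack_util.hpp.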
@@ -348,8 +352,6 @@ namespace ck elementsE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); - element_op = cde_element_op; - mA_real.reset(nullptr); mA_imag.reset(nullptr); mB_real.reset(nullptr); @@ -449,21 +451,21 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, ScaleCDEElementwiseOperation{1.0f}); + mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); mBilinearArgs[0] = allocBilinearArgs( mE_real, mA_imag, mB_imag, mE_real, - BilinearCDEElementwiseOperation{-1.0f, 1.0f}); + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, ScaleCDEElementwiseOperation{1.0f}); + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); mBilinearArgs[1] = allocBilinearArgs( mE_imag, mA_imag, mB_real, mE_imag, - BilinearCDEElementwiseOperation{1.0f, 1.0f}); + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); // TODO UNCOMMENT WHEN DONE @@ -512,7 +514,7 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; - ScaleCDEElementwiseOperation element_op{1.0}; + ScaleCDEElementwiseOperation element_op; void* mE_grid; index_t elementsE; }; diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp index 9e9c8f9a..3133f4cd 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp @@ -48,9 +48,10 @@ namespace ck namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -64,7 +65,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( std::vector>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp index b9183a21..b358be8a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void 
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale, + ScaleComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp index 1f87031d..359a074a 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale, + ScaleComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp index ef7724e0..4cc8659d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F32 = float; - using CF32 = hipFloatComplex; - using Empty_Tuple = ck::Tuple<>; + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( @@ -75,7 +76,7 @@ namespace ck CF32, PassThrough, PassThrough, - Scale, + ScaleComplex, CF32>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp index e22aab5f..1cac8ebb 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp @@ -48,9 +48,10 @@ namespace ck namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 
= double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E @@ -64,7 +65,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( @@ -77,7 +78,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp index 58ed790a..e60bbd61 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp index 562519f5..e44d24e1 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp index 724d89cf..dee9ce39 100644 --- 
a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp @@ -46,9 +46,10 @@ namespace ck { namespace instance { - using F64 = double; - using CF64 = hipDoubleComplex; - using Empty_Tuple = ck::Tuple<>; + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E @@ -62,7 +63,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale>; + ScaleComplex>; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( @@ -75,7 +76,7 @@ namespace ck CF64, PassThrough, PassThrough, - Scale, + ScaleComplex, CF64>>>& instances) { add_device_operation_instances( diff --git a/library/src/contraction/device/device_element_wise_complex.hpp b/library/src/contraction/device/device_element_wise_operation_complex.hpp similarity index 87% rename from library/src/contraction/device/device_element_wise_complex.hpp rename to library/src/contraction/device/device_element_wise_operation_complex.hpp index 6dfd94e0..a01ced36 100644 --- a/library/src/contraction/device/device_element_wise_complex.hpp +++ b/library/src/contraction/device/device_element_wise_operation_complex.hpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#ifndef HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP -#define HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP +#ifndef HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP +#define HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP #include #include @@ -37,11 +37,6 @@ namespace element_wise { struct ScaleComplex : public Scale { - __host__ __device__ ScaleComplex(hipFloatComplex scale) : Scale(hipCrealf(scale)) - { - scale_ = hipComplexFloatToDouble(scale); - } - __host__ __device__ ScaleComplex(hipDoubleComplex scale) : Scale(hipCreal(scale)) { scale_ = scale; @@ -68,12 +63,6 @@ struct ScaleComplex : public Scale struct BilinearComplex : public Bilinear { - BilinearComplex(hipFloatComplex alpha, hipFloatComplex beta) : Bilinear(hipCrealf(alpha), hipCrealf(beta)) - { - alpha_ = hipComplexFloatToDouble(alpha); - beta_ = hipComplexFloatToDouble(beta); - } - BilinearComplex(hipDoubleComplex alpha, hipDoubleComplex beta) : Bilinear(hipCreal(alpha), hipCreal(beta)) { alpha_ = alpha; @@ -105,4 +94,4 @@ struct BilinearComplex : public Bilinear } // namespace tensor_operation } // namespace ck -#endif // HIPTENSOR_ELEMENT_WISE_COMPLEX_HPP +#endif // HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP diff --git a/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp index eac0f117..81d7edf5 100644 --- a/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp +++ b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp @@ -44,9 +44,11 @@ namespace ck using F64 = double; using CF64 = hipDoubleComplex; using CF64_Tuple = ck::Tuple; - + + using BilinearComplex = element_wise::BilinearComplex; + void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( std::vector>>& 
instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( std::vector>>& instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( std::vector>>& instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( std::vector>>& instances); + BilinearComplex, + CF32>>>& instances); // double void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( std::vector>>& instances); + BilinearComplex, + CF64>>>& instances); // Contraction + Bilinear template , ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - ComputeDataT>> + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, @@ -191,8 +193,8 @@ namespace ck HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear, - ComputeDataT>; + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>; static auto GetInstances() { @@ -203,13 +205,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( op_ptrs); } } @@ -220,13 
+222,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( op_ptrs); } } diff --git a/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp index fff9dca6..705ac6c0 100644 --- a/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp +++ b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp @@ -44,8 +44,10 @@ namespace ck using F64 = double; using CF64 = hipDoubleComplex; + using ScaleComplex = element_wise::ScaleComplex; + void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( std::vector>>& instances); + ScaleComplex, + CF32>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); void - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( std::vector>>& instances); + ScaleComplex, + CF64>>>& instances); // Contraction + Scale template , ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - 
ck::tensor_operation::element_wise::Scale, - ComputeDataType>> + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, @@ -188,8 +190,8 @@ namespace ck HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale, - ComputeDataType>; + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>; static auto GetInstances() { @@ -200,13 +202,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( op_ptrs); } } @@ -216,13 +218,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( op_ptrs); } } diff --git a/library/src/include/meta_traits.hpp b/library/src/include/meta_traits.hpp index 0e039cd6..2cd0d740 100644 --- a/library/src/include/meta_traits.hpp +++ b/library/src/include/meta_traits.hpp @@ -32,7 +32,7 @@ namespace hiptensor // Placeholder for building traits on any type T // Use partial or full specialization for any class. 
- template + template struct MetaTraits; } // namespace hiptensor From cf233d82a08ebb0618bf8ca178388d754a87acf6 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 21 Dec 2023 22:45:45 +0000 Subject: [PATCH 38/42] Add unit test of contraction with complex compute type - Fix issue in yaml_test - Fix a bug in hiptensorInitContractionDescriptor --- .../src/contraction/contraction_selection.cpp | 19 ++--- .../src/contraction/hiptensor_contraction.cpp | 12 ++- library/src/data_types.cpp | 79 +++++++------------ library/src/include/data_types.hpp | 42 +++++----- test/00_unit/yaml_test.cpp | 8 +- .../configs/bilinear_test_params.yaml | 14 ++-- .../configs/complex_bilinear_test_params.yaml | 23 ++---- .../configs/scale_test_params.yaml | 14 ++-- test/01_contraction/contraction_test.cpp | 56 +++++++------ .../contraction_test_params.hpp | 4 +- test/llvm/yaml_parser_config.cpp | 14 +++- 11 files changed, 133 insertions(+), 152 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index b2e54d80..1f7b70a6 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -81,22 +81,17 @@ namespace hiptensor * ``` * Hence, the `alpha` and `bete` need to point to a ComputeData value */ - hipDoubleComplex alpha; - hipDoubleComplex beta; - if(computeType == HIPTENSOR_COMPUTE_C32F) + ScalarData alpha; + ScalarData beta; + if(computeType == HIPTENSOR_COMPUTE_C32F || computeType == HIPTENSOR_COMPUTE_C64F) { - writeVal(&alpha, computeType, hipFloatComplex{1.02, 1.03}); - writeVal(&beta, computeType, hipFloatComplex{1.04, 1.05}); - } - else if(computeType == HIPTENSOR_COMPUTE_C64F) - { - writeVal(&alpha, computeType, hipDoubleComplex{1.02, 1.03}); - writeVal(&beta, computeType, hipDoubleComplex{1.04, 1.05}); + writeVal(&alpha, computeType, {computeType, 1.02, 1.03}); + writeVal(&beta, computeType, {computeType, 1.04, 1.05}); } else { - writeVal(&alpha, computeType, 1.02); - writeVal(&beta, computeType, 1.03); + writeVal(&alpha, computeType, ScalarData(computeType, 1.02)); + writeVal(&beta, computeType, ScalarData(computeType, 1.03)); } CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 8148eeaa..d063ebf5 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -147,7 +147,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* // Use a scale contraction due to // tensor C-descriptor is empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::SCALE, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? hiptensor::ContractionOpId_t::SCALE_COMPLEX + : hiptensor::ContractionOpId_t::SCALE; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, @@ -161,7 +165,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* { // Use a bilinear contraction due to // tensor C-descriptor is not empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::BILINEAR, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? 
hiptensor::ContractionOpId_t::BILINEAR_COMPLEX + : hiptensor::ContractionOpId_t::BILINEAR; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, *descC, *descD}, {alignmentRequirementA, diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index abaf7154..69e29b50 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -153,130 +153,105 @@ namespace hiptensor { if(id == HIPTENSOR_COMPUTE_16F) { - return ScalarData(*(_Float16*)value, id); + return ScalarData(id, *(_Float16*)value); } else if(id == HIPTENSOR_COMPUTE_16BF) { - return ScalarData(*(hip_bfloat16*)value, id); + return ScalarData(id, *(hip_bfloat16*)value); } else if(id == HIPTENSOR_COMPUTE_32F) { - return ScalarData(*(float*)value, id); + return ScalarData(id, *(float*)value); } else if(id == HIPTENSOR_COMPUTE_64F) { - return ScalarData(*(double*)value, id); + return ScalarData(id, *(double*)value); } else if(id == HIPTENSOR_COMPUTE_8U) { - return ScalarData(*(uint8_t*)value, id); + return ScalarData(id, *(uint8_t*)value); } else if(id == HIPTENSOR_COMPUTE_8I) { - return ScalarData(*(int8_t*)value, id); + return ScalarData(id, *(int8_t*)value); } else if(id == HIPTENSOR_COMPUTE_32U) { - return ScalarData(*(uint32_t*)value, id); + return ScalarData(id, *(uint32_t*)value); } else if(id == HIPTENSOR_COMPUTE_32I) { - return ScalarData(*(int32_t*)value, id); + return ScalarData(id, *(int32_t*)value); } else if(id == HIPTENSOR_COMPUTE_C32F) { - return {*(hipFloatComplex*)value, id}; + auto complex = *(hipFloatComplex*)value; + return {id, complex.x, complex.y}; } else if(id == HIPTENSOR_COMPUTE_C64F) { - return {*(hipDoubleComplex*)value, id}; + auto complex = *(hipDoubleComplex*)value; + return {id, complex.x, complex.y}; } else { #if !NDEBUG std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; #endif // !NDEBUG - return {0, HIPTENSOR_COMPUTE_NONE}; + return {HIPTENSOR_COMPUTE_NONE, 0, 0}; } } - void writeVal(void const* addr, hiptensorComputeType_t id, double value) + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value) { if(id == HIPTENSOR_COMPUTE_16F) { - *(_Float16*)addr = value; + *(_Float16*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_16BF) { - *(hip_bfloat16*)addr = value; + *(hip_bfloat16*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_32F) { - *(float*)addr = value; + *(float*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_64F) { - *(double*)addr = value; + *(double*)addr = value.mReal; } else if(id == HIPTENSOR_COMPUTE_8U) { - *(uint8_t*)addr = value; + *(uint8_t*)addr = (uint8_t)value.mReal; } else if(id == HIPTENSOR_COMPUTE_8I) { - *(int8_t*)addr = value; + *(int8_t*)addr = (int8_t)value.mReal; } else if(id == HIPTENSOR_COMPUTE_32U) { - *(uint32_t*)addr = value; + *(uint32_t*)addr = (uint32_t)value.mReal; } else if(id == HIPTENSOR_COMPUTE_32I) { - *(int32_t*)addr = value; + *(int32_t*)addr = (int32_t)value.mReal; } - else - { -#if !NDEBUG - std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; -#endif // !NDEBUG - return; - } - } - - void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value) - { - if(id == HIPTENSOR_COMPUTE_C32F) - { - *(hipFloatComplex*)addr = value; - } - else + else if(id == HIPTENSOR_COMPUTE_C32F) { -#if !NDEBUG - std::cout << "Data type is hipFloatComplex, but hiptensorComputeType_t is not " - "HIPTENSOR_COMPUTE_C32F: " - << id << std::endl; -#endif // !NDEBUG - return; + *(hipFloatComplex*)addr = hipComplexDoubleToFloat(value.mComplex); } - } - - void 
writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value) - { - if(id == HIPTENSOR_COMPUTE_C64F) + else if(id == HIPTENSOR_COMPUTE_C64F) { - *(hipDoubleComplex*)addr = value; + *(hipDoubleComplex*)addr = value.mComplex; } else { #if !NDEBUG - std::cout << "Data type is hipDoubleComplex, but hiptensorComputeType_t is not " - "HIPTENSOR_COMPUTE_C64F: " - << id << std::endl; + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; #endif // !NDEBUG return; } } - } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index aa2eaa40..900b2069 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -46,44 +46,41 @@ namespace hiptensor struct ScalarData { - hiptensorComputeType_t type; + hiptensorComputeType_t mType; union { - double real; - hipDoubleComplex complex; + double mReal; + hipDoubleComplex mComplex; }; ScalarData() = default; - ScalarData(double value, hiptensorComputeType_t type) - : real(value) - , type(type) - { - } - ScalarData(hipFloatComplex value, hiptensorComputeType_t type) - : complex(hipComplexFloatToDouble(value)) - , type(type) - { - } - ScalarData(hipDoubleComplex value, hiptensorComputeType_t type) - : complex(value) - , type(type) + ScalarData(hiptensorComputeType_t type, double real, double imag = 0) { + mType = type; + if(type == HIPTENSOR_COMPUTE_C32F || type == HIPTENSOR_COMPUTE_C64F) + { + mComplex = make_hipDoubleComplex(real, imag); + } + else + { + mReal = real; + } } operator float() const { - return static_cast(real); + return static_cast(mReal); } operator double() const { - return real; + return mReal; } operator hipFloatComplex() const { - return hipComplexDoubleToFloat(complex); + return hipComplexDoubleToFloat(mComplex); } operator hipDoubleComplex() const { - return complex; + return mComplex; } }; @@ -109,10 +106,7 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); - void writeVal(void const* addr, hiptensorComputeType_t id, double value); - void writeVal(void const* addr, hiptensorComputeType_t id, hipDoubleComplex value); - void writeVal(void const* addr, hiptensorComputeType_t id, hipFloatComplex value); - + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value); } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/test/00_unit/yaml_test.cpp b/test/00_unit/yaml_test.cpp index 372fbbdd..57a86a25 100644 --- a/test/00_unit/yaml_test.cpp +++ b/test/00_unit/yaml_test.cpp @@ -54,8 +54,8 @@ namespace hiptensor using LengthsT = std::vector; using StridesT = std::vector; - using AlphaT = double; - using BetaT = double; + using AlphaT = std::vector; + using BetaT = std::vector; //Data types of input and output tensors std::vector mDataTypes; @@ -98,8 +98,8 @@ int main(int argc, char* argv[]) yee.mProblemLengths = {{5, 6, 7, 8, 4, 2, 3, 4}, {1, 2, 3, 4}, {99, 12, 44, 31, 59, 23, 54, 22}}; yee.mProblemStrides = {{}}; - yee.mAlphas = {0, 1, 1}; - yee.mBetas = {2, 2, 2}; + yee.mAlphas = {{0}, {1}, {1}}; + yee.mBetas = {{2}, {2}, {2}}; struct TmpFileWrapper { diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index cbaee86a..1e7999fc 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -8,8 +8,6 @@ Tensor 
Data Types: - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] - - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] - - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -21,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml index cbaee86a..0d59c05d 100644 --- a/test/01_contraction/configs/complex_bilinear_test_params.yaml +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -1,15 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] - - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] - - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_R_32F ] - - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_R_64F ] + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -21,13 +14,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0, 0] + - [1, 1] + - [1, 1] Betas: - - 2 - - 0 - - 2 + - [2, 2] + - [0, 0] + - [2, 2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 4e640034..bc8289f5 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -8,8 +8,6 @@ Tensor Data Types: - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] - - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_R_32F ] - - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_R_64F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -21,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 2059fd73..a75cf7bf 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -57,7 +57,8 @@ namespace hiptensor bool ContractionTest::checkDevice(hipDataType datatype) const { return (isF32Supported() - && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF || datatype == HIP_C_32F)) + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF + || datatype == 
HIP_C_32F)) || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F)); } @@ -131,7 +132,8 @@ namespace hiptensor || (DDataType == HIP_C_32F) || (DDataType == HIP_C_64F)); EXPECT_TRUE( (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) - || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)); + || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F) + || (computeType == HIPTENSOR_COMPUTE_C32F) || (computeType == HIPTENSOR_COMPUTE_C64F)); mRunFlag &= checkDevice(DDataType); @@ -297,28 +299,36 @@ namespace hiptensor else if(ADataType == HIP_C_32F && BDataType == HIP_C_32F && DDataType == HIP_C_32F) { // Initialize matrix data on device - fillLaunchKernel((hipFloatComplex*)resource->deviceA().get(), elementsA); - fillLaunchKernel((hipFloatComplex*)resource->deviceB().get(), elementsB); + fillLaunchKernel((hipFloatComplex*)resource->deviceA().get(), + elementsA); + fillLaunchKernel((hipFloatComplex*)resource->deviceB().get(), + elementsB); if(CDataType == HIP_C_32F) { - fillLaunchKernel((hipFloatComplex*)resource->deviceC().get(), elementsCD); + fillLaunchKernel((hipFloatComplex*)resource->deviceC().get(), + elementsCD); } - fillValLaunchKernel((hipFloatComplex*)resource->deviceD().get(), - elementsCD, - std::numeric_limits::signaling_NaN()); + fillValLaunchKernel( + (hipFloatComplex*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); } - else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F) + else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F) { // Initialize matrix data on device - fillLaunchKernel((hipDoubleComplex*)resource->deviceA().get(), elementsA); - fillLaunchKernel((hipDoubleComplex*)resource->deviceB().get(), elementsB); + fillLaunchKernel((hipDoubleComplex*)resource->deviceA().get(), + elementsA); + fillLaunchKernel((hipDoubleComplex*)resource->deviceB().get(), + elementsB); if(CDataType == HIP_C_64F) { - fillLaunchKernel((hipDoubleComplex*)resource->deviceC().get(), elementsCD); + fillLaunchKernel((hipDoubleComplex*)resource->deviceC().get(), + elementsCD); } - fillValLaunchKernel((hipDoubleComplex*)resource->deviceD().get(), - elementsCD, - std::numeric_limits::signaling_NaN()); + fillValLaunchKernel( + (hipDoubleComplex*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); } resource->copyDeviceToHostAll(elementBytes); @@ -515,7 +525,8 @@ namespace hiptensor stream << std::endl; stream << "Tensor D elements:\n"; - hiptensorPrintArrayElements(stream, (hipFloatComplex*)D.get(), elementsCD); + hiptensorPrintArrayElements( + stream, (hipFloatComplex*)D.get(), elementsCD); stream << std::endl; } else if(DDataType == HIP_C_64F) @@ -536,7 +547,8 @@ namespace hiptensor stream << std::endl; stream << "Tensor D elements:\n"; - hiptensorPrintArrayElements(stream, (hipDoubleComplex*)D.get(), elementsCD); + hiptensorPrintArrayElements( + stream, (hipDoubleComplex*)D.get(), elementsCD); stream << std::endl; } } @@ -573,10 +585,10 @@ namespace hiptensor * ``` * Hence, the `alpha` and `bete` need to point to a ComputeData value */ - double alphaBuf = 0.; - double betaBuf = 0.; - writeVal(&alphaBuf, computeType, alpha); - writeVal(&betaBuf, computeType, beta); + ScalarData alphaBuf; + ScalarData betaBuf; + writeVal(&alphaBuf, computeType, ScalarData(computeType, alpha[0], alpha[1])); + writeVal(&betaBuf, computeType, ScalarData(computeType, beta[0], beta[1])); 
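For readers following the scalar plumbing above: `ScalarData` is the tagged scalar introduced by this patch's data_types hunks, and `writeVal()` narrows it to the in-memory form the selected compute type expects. A condensed, self-contained sketch of the two pieces (most real-typed branches trimmed, error handling omitted; the <hiptensor/hiptensor.hpp> include for the enum is an assumption of this note, not part of the patch):

    #include <hip/hip_complex.h>
    #include <hiptensor/hiptensor.hpp> // assumed public header for hiptensorComputeType_t

    struct ScalarData
    {
        hiptensorComputeType_t mType;
        union
        {
            double           mReal;
            hipDoubleComplex mComplex;
        };

        ScalarData(hiptensorComputeType_t type, double real, double imag = 0)
            : mType(type)
        {
            if(type == HIPTENSOR_COMPUTE_C32F || type == HIPTENSOR_COMPUTE_C64F)
            {
                mComplex = make_hipDoubleComplex(real, imag);
            }
            else
            {
                mReal = real;
            }
        }
    };

    // Narrow the staged scalar to the exact bytes `id` expects at `addr`.
    void writeVal(void* addr, hiptensorComputeType_t id, ScalarData value)
    {
        if(id == HIPTENSOR_COMPUTE_32F)
        {
            *(float*)addr = static_cast<float>(value.mReal);
        }
        else if(id == HIPTENSOR_COMPUTE_C32F)
        {
            *(hipFloatComplex*)addr = hipComplexDoubleToFloat(value.mComplex);
        }
        else if(id == HIPTENSOR_COMPUTE_C64F)
        {
            *(hipDoubleComplex*)addr = value.mComplex;
        }
        // ... remaining real compute types follow the 32F pattern ...
    }

Staging everything in double / hipDoubleComplex precision lets one struct carry any supported compute type losslessly until the final narrowing store.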
CHECK_HIPTENSOR_ERROR( hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); @@ -643,7 +655,7 @@ namespace hiptensor std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); } - else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F) + else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD); diff --git a/test/01_contraction/contraction_test_params.hpp b/test/01_contraction/contraction_test_params.hpp index 29c4aa1b..4db4ebc1 100644 --- a/test/01_contraction/contraction_test_params.hpp +++ b/test/01_contraction/contraction_test_params.hpp @@ -49,8 +49,8 @@ namespace hiptensor using LengthsT = std::vector; using StridesT = std::vector; - using AlphaT = double; - using BetaT = double; + using AlphaT = std::vector; + using BetaT = std::vector; public: std::vector& dataTypes() diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 5c674045..8b504b01 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -92,6 +92,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorOperator_t) LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorWorksizePreference_t) LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) +LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector) LLVM_YAML_IS_SEQUENCE_VECTOR(AlphaT) LLVM_YAML_IS_SEQUENCE_VECTOR(BetaT) @@ -229,10 +230,10 @@ namespace llvm io.mapRequired("Algorithm Types", doc.algorithms()); io.mapRequired("Operators", doc.operators()); io.mapRequired("Worksize Prefs", doc.workSizePrefrences()); - io.mapRequired("Alphas", (std::vector&)(doc.alphas())); + io.mapOptional("Alphas", (std::vector>&)(doc.alphas())); io.mapOptional("Betas", - (std::vector&)(doc.betas()), - std::vector(doc.alphas().size(), BetaT(0))); + (std::vector>&)(doc.betas()), + std::vector>(doc.alphas().size())); io.mapRequired("Lengths", doc.problemLengths()); // Default values for optional values @@ -259,6 +260,13 @@ namespace llvm return "Error: Empty Alphas"; } + if(std::any_of(doc.alphas().cbegin(), doc.alphas().cend(), [](auto&& alpha) { + return alpha.size() > 2 || alpha.size() <= 0; + })) + { + return "Error: invalid Alpha"; + } + if(doc.betas().size() > 0 && doc.betas().size() != doc.alphas().size()) { return "Error: Alphas and betas must have same size"; From c01eda7cadbfc790370440cd7413928b01af317a Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Fri, 22 Dec 2023 12:06:21 -0500 Subject: [PATCH 39/42] Modify samples to use new compute type - Fix bug in samples - Add unit tests for scale contraction --- samples/01_contraction/CMakeLists.txt | 4 +- .../simple_bilinear_contraction.hpp | 15 ++++-- ...tion_cf32_cf32_cf32_cf32_compute_cf32.cpp} | 6 +-- .../simple_scale_contraction.hpp | 13 ++++- ...ntraction_cf32_cf32_cf32_compute_cf32.cpp} | 6 +-- test/01_contraction/CMakeLists.txt | 6 ++- .../complex_scale_contraction_test.cpp | 48 +++++++++++++++++++ .../configs/complex_scale_test_params.yaml | 30 ++++++++++++ 8 files changed, 114 insertions(+), 14 deletions(-) rename samples/01_contraction/{simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp => simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp} (95%) rename samples/01_contraction/{simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp => simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp} 
(95%) create mode 100644 test/01_contraction/complex_scale_contraction_test.cpp create mode 100644 test/01_contraction/configs/complex_scale_test_params.yaml diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index c51a2dbc..d255c0e4 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -31,7 +31,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) - add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) @@ -39,7 +39,7 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) - add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_f32 simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_cf32 simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index 27001232..b0348b91 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -37,15 +37,24 @@ template int bilinearContractionSample() { - floatTypeCompute alpha = (floatTypeCompute)1.0f; - floatTypeCompute beta = (floatTypeCompute)1.0f; + computeDataType alpha, beta; + if constexpr(std::is_same_v || std::is_same_v) + { + alpha = computeDataType(1.0, 1.0); + beta = computeDataType(1.0, 1.0); + } + else + { + alpha = (computeDataType)1.0f; + beta = (computeDataType)1.0f; + } /********************** * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp similarity index 95% rename from 
samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp index 25392592..648675f6 100644 --- a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp @@ -39,17 +39,17 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex CDataType; - typedef float floatTypeCompute; + typedef hipFloatComplex ComputeDataType; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeC = HIP_C_32F; - constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; return bilinearContractionSample int scaleContractionSample() { - floatTypeCompute alpha = (floatTypeCompute)1.0f; + computeDataType alpha; + if constexpr(std::is_same_v || std::is_same_v) + { + alpha = computeDataType(1.0, 1.0); + } + else + { + alpha = (computeDataType)1.0f; + } + /********************** * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} **********************/ diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp similarity index 95% rename from samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp rename to samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp index 7fc5c3a3..0f6eaac3 100644 --- a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp @@ -39,17 +39,17 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex DDataType; - typedef float floatTypeCompute; + typedef hipFloatComplex ComputeDataType; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeD = HIP_C_32F; - constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; return scaleContractionSample +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexScaleContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexScaleContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexScaleContractionTest, load_config_helper()); diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml new file mode 100644 index 00000000..89f9e736 --- /dev/null +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -0,0 +1,30 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_C_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + # - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY +Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - 
HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - [0, 0] + - [1, 1] + - [1, 1] +Betas: + - [2, 2] + - [0, 0] + - [2, 2] +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 2 ] +Strides: + - [] +... From 23c033fcbd20e2268bbe0f1e708cb197481f6773 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Fri, 22 Dec 2023 17:27:11 +0000 Subject: [PATCH 40/42] Support Complex alpha and beta in samples - Add complex_scale_contraction_test.cpp - Fixed bug in device_contraction_bilinear_complex.hpp --- .../device_contraction_bilinear_complex.hpp | 290 ++++++++++++------ .../device_contraction_scale_complex.hpp | 95 +++--- .../src/contraction/hiptensor_contraction.cpp | 11 +- library/src/data_types.cpp | 16 + library/src/include/data_types.hpp | 5 + .../simple_bilinear_contraction.hpp | 28 +- ...ction_bf16_bf16_bf16_bf16_compute_bf16.cpp | 5 +- ...ction_cf32_cf32_cf32_cf32_compute_cf32.cpp | 7 +- ...ontraction_f16_f16_f16_f16_compute_f16.cpp | 5 +- ...ntraction_f32_f32_f32_f32_compute_bf16.cpp | 5 +- ...ontraction_f32_f32_f32_f32_compute_f16.cpp | 5 +- ...ontraction_f32_f32_f32_f32_compute_f32.cpp | 5 +- ...ontraction_f64_f64_f64_f64_compute_f32.cpp | 5 +- ...ontraction_f64_f64_f64_f64_compute_f64.cpp | 5 +- .../simple_scale_contraction.hpp | 15 +- ...ontraction_bf16_bf16_bf16_compute_bf16.cpp | 4 +- ...ontraction_cf32_cf32_cf32_compute_cf32.cpp | 6 +- ...le_contraction_f16_f16_f16_compute_f16.cpp | 4 +- ...e_contraction_f32_f32_f32_compute_bf16.cpp | 4 +- ...le_contraction_f32_f32_f32_compute_f16.cpp | 4 +- ...le_contraction_f32_f32_f32_compute_f32.cpp | 4 +- ...le_contraction_f64_f64_f64_compute_f32.cpp | 4 +- ...le_contraction_f64_f64_f64_compute_f64.cpp | 4 +- .../configs/complex_bilinear_test_params.yaml | 4 +- .../configs/complex_scale_test_params.yaml | 4 +- test/device/common.hpp | 2 +- 26 files changed, 326 insertions(+), 220 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 7fc09504..712ff3b0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -43,8 +43,10 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; + using Scale = ck::tensor_operation::element_wise::Scale; + using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; // The following is a specialization class for bilinear contractions of complex types. 
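    // (Recall (a + bi)(c + di) = (ac - bd) + (ad + bc)i: each plane of the
    // complex result is a signed sum of two real contractions, and the
    // complex alpha/beta are applied afterwards, when the planes are
    // re-interleaved into the packed output.)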
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -169,9 +171,13 @@ namespace ck HIP_vector_type> { // Complex device Op - using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; - using CDEElementwiseOperation = BilinearComplex; - using DecompCDEElementwiseOperation = Bilinear; + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + // CDE Operations + using ScaleCDEElementwiseOperation = ScaleComplex; + using DecompScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = BilinearComplex; + using DecompBilinearCDEElementwiseOperation = Bilinear; // Complex types given through the interface using ComplexA = HIP_vector_type; @@ -202,7 +208,55 @@ namespace ck // The internal operation that we will decompose the complex operations with. // For complex will be either float or double - using DecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + DecompScaleCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + DecompCompute, + LoopSched>; + + // The internal operation that we will decompose the complex operations with. 
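    // (Two flavours of it are needed: the Scale op above writes a fresh
    // output plane without reading any prior D values, and the Bilinear op
    // below then accumulates the second partial product into that plane.)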
+ // For complex will be either float or double + using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, @@ -214,7 +268,7 @@ namespace ck DecompE, AElementwiseOperation, BElementwiseOperation, - DecompCDEElementwiseOperation, + DecompBilinearCDEElementwiseOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, @@ -251,13 +305,14 @@ namespace ck // Argument struct Argument : public BaseArgument { - using DecompArg = typename DecompOp::Argument; + using ScaleDecompArgument = typename ScaleDecompOp::Argument; + using BilinearDecompArgument = typename BilinearDecompOp::Argument; Argument(Argument&& other) - : mArgs({std::move(other.mArgs[0]), - std::move(other.mArgs[1]), - std::move(other.mArgs[2]), - std::move(other.mArgs[3])}) + : mScaleArgs( + {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])}) + , mBilinearArgs({std::move(other.mBilinearArgs[0]), + std::move(other.mBilinearArgs[1])}) { } @@ -265,10 +320,10 @@ namespace ck { if(this != &other) { - mArgs[0] = std::move(other.mArgs[0]); - mArgs[1] = std::move(other.mArgs[1]); - mArgs[2] = std::move(other.mArgs[2]); - mArgs[3] = std::move(other.mArgs[3]); + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); } return *this; } @@ -287,7 +342,8 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) : element_op(cde_element_op) + BilinearCDEElementwiseOperation cde_element_op) + : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. @@ -310,7 +366,7 @@ namespace ck mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_grid = p_e_grid; + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -334,36 +390,34 @@ namespace ck } }; + // Decompose the incoming data from AOS->SOA decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); - auto allocArgs = [a_ms_ks_lengths, - a_ms_ks_strides, - b_ns_ks_lengths, - b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, - e_ms_ns_lengths, - e_ms_ns_strides, - a_element_op, - b_element_op](auto& out_e, - auto const& in_a, - auto const& in_b, - auto const& in_d, - auto const& cde_element_op) { - return std::make_unique( + auto allocScaleArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& cde_element_op) { + return std::make_unique( in_a.get(), in_b.get(), - std::array{in_d.get()}, + std::array{}, out_e.get(), a_ms_ks_lengths, a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, + std::array, 0>{}, + std::array, 0>{}, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -371,46 +425,88 @@ namespace ck cde_element_op); }; - mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, DecompCDEElementwiseOperation{1.0f, 1.0f}); - mArgs[1] = allocArgs(mE_real, - mA_imag, - mB_imag, - mE_real, - DecompCDEElementwiseOperation{-1.0f, - 1.0f}); - mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, 
DecompCDEElementwiseOperation{1.0f, 1.0f}); - mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - DecompCDEElementwiseOperation{1.0f , 1.0f}); + auto allocBilinearArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& in_d, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{in_d.get()}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{e_ms_ns_lengths}, + std::array, 1>{e_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); + + // TODO UNCOMMENT WHEN DONE // original - /* TODO :Uncomment once done - mArgs[0] = allocArgs(mE_real, mA_real, mB_real, mD_real, cde_element_op); - mArgs[1] = allocArgs(mE_real, - mA_imag, - mB_imag, - mE_real, - CDEElementwiseOperation{cde_element_op.alpha_ * -1.0f, - 1.0f}); - mArgs[2] = allocArgs(mE_imag, mA_real, mB_imag, mD_imag, cde_element_op); - mArgs[3] = allocArgs(mE_imag, mA_imag, mB_real, mE_imag, - CDEElementwiseOperation{cde_element_op.alpha_ , 1.0f});*/ + /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); + mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); + mBilinearArgs[0] = allocBilinearArgs( + mE_real, + mA_imag, + mB_imag, + mE_real, + BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); + mBilinearArgs[1] = allocBilinearArgs( + mE_imag, + mA_imag, + mB_real, + mE_imag, + BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const { - std::cout << "Args0:" << std::endl; - mArgs[0]->Print(); - std::cout << "Args1:" << std::endl; - mArgs[1]->Print(); - std::cout << "Args2:" << std::endl; - mArgs[2]->Print(); - std::cout << "Args3:" << std::endl; - mArgs[3]->Print(); + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + mBilinearArgs[1]->Print(); } // private: // Each argument set for complex: - std::unique_ptr mArgs[4]; + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; template using DeviceArray = std::unique_ptr; @@ -425,9 +521,9 @@ namespace ck DeviceArray mE_real; DeviceArray mE_imag; - CDEElementwiseOperation element_op; - void* mE_grid; - index_t elementsE; + BilinearCDEElementwiseOperation element_op; + void* mE_grid; + index_t elementsE; }; // Invoker @@ -436,12 +532,14 @@ namespace ck using Argument = typename DeviceOp::Argument; Invoker() - : mInvoker(std::make_unique()) + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) { } Invoker(Invoker&& other) - : mInvoker(std::move(other.mInvoker)) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , 
mBilinearInvoker(std::move(other.mBilinearInvoker)) { } @@ -449,7 +547,8 @@ namespace ck { if(this != &other) { - mInvoker = std::move(other.mInvoker); + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); } return *this; } @@ -457,19 +556,23 @@ namespace ck float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - auto r0 = mInvoker->Run(arg.mArgs[0].get(), stream_config); - auto r1 = mInvoker->Run(arg.mArgs[1].get(), stream_config); - auto r2 = mInvoker->Run(arg.mArgs[2].get(), stream_config); - auto r3 = mInvoker->Run(arg.mArgs[3].get(), stream_config); + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); if(arg.mE_grid != nullptr) { auto blockDim = dim3(1024); - auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::mfma<<>>( - arg.mE_real.get(), arg.mE_imag.get(), arg.mD_real.get(), arg.mD_imag.get(), - ((ComplexE*)arg.mE_grid), arg.element_op.alpha_, arg.element_op.beta_, - arg.elementsE); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + hiptensor::mfma<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + arg.mD_real.get(), + arg.mD_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.alpha_, + arg.element_op.beta_, + arg.elementsE); //hiptensor::pack<<>>( // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } @@ -484,15 +587,16 @@ namespace ck return Run(*dynamic_cast(p_arg), stream_config); } - std::unique_ptr mInvoker; + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; }; static bool IsSupportedArgument(const Argument& arg) { - return DecompOp::IsSupportedArgument(*(arg.mArgs[0].get())) - && DecompOp::IsSupportedArgument(*(arg.mArgs[1].get())) - && DecompOp::IsSupportedArgument(*(arg.mArgs[2].get())) - && DecompOp::IsSupportedArgument(*(arg.mArgs[3].get())); + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); } // polymorphic @@ -510,10 +614,14 @@ namespace ck // Call the base, then fwd to each arg. 
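            // All four decomposed argument sets (two scale, two bilinear)
            // must see the same externally provided workspace.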
this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); auto* arg = dynamic_cast(p_arg); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[0].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[1].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[2].get(), p_workspace, s); - this->BaseOperator::SetWorkSpacePointer(arg->mArgs[3].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); } static auto MakeArgument( @@ -531,7 +639,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) + BilinearCDEElementwiseOperation cde_element_op) { return Argument{p_a, p_b, @@ -571,7 +679,7 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - CDEElementwiseOperation cde_element_op) override + BilinearCDEElementwiseOperation cde_element_op) override { return std::make_unique(p_a, p_b, diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index 47b84e2c..b875db3b 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -43,10 +43,10 @@ namespace ck using hiptensor::DeviceDeleter; using hiptensor::elementSpaceFromLengthsAndStrides; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; - using Scale = ck::tensor_operation::element_wise::Scale; - using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex; + using Scale = ck::tensor_operation::element_wise::Scale; + using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex; // The following is a specialization class for bilinear contractions of complex types. 
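    // (Same plane-wise scheme as the bilinear specialization; here the
    // complex alpha is applied by a single pointwise multiply kernel when
    // the two product planes are re-interleaved into the packed output.)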
// For complex types, the contraction can be decomposed into 4 simple bilinear contractions of @@ -307,8 +307,8 @@ namespace ck using BilinearDecompArgument = typename BilinearDecompOp::Argument; Argument(Argument&& other) - : mScaleArgs({std::move(other.mScaleArgs[0]), - std::move(other.mScaleArgs[1])}) + : mScaleArgs( + {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])}) , mBilinearArgs({std::move(other.mBilinearArgs[0]), std::move(other.mBilinearArgs[1])}) { @@ -318,10 +318,10 @@ namespace ck { if(this != &other) { - mScaleArgs[0] = std::move(other.mScaleArgs[0]); - mScaleArgs[1] = std::move(other.mScaleArgs[1]); - mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); - mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); } return *this; } @@ -340,7 +340,8 @@ namespace ck const std::vector& e_ms_ns_strides, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, - ScaleCDEElementwiseOperation cde_element_op) : element_op(cde_element_op) + ScaleCDEElementwiseOperation cde_element_op) + : element_op(cde_element_op) { // Take the incoming arguments, treat them as complex. @@ -359,7 +360,7 @@ namespace ck mE_real.reset(nullptr); mE_imag.reset(nullptr); - mE_grid = p_e_grid; + mE_grid = p_e_grid; auto blockDim = dim3(1024); auto decompGrid = [blockDim](auto& out_r, @@ -392,8 +393,6 @@ namespace ck a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -410,8 +409,8 @@ namespace ck a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, + std::array, 0>{}, + std::array, 0>{}, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -423,8 +422,6 @@ namespace ck a_ms_ks_strides, b_ns_ks_lengths, b_ns_ks_strides, - ds_ms_ns_lengths, - ds_ms_ns_strides, e_ms_ns_lengths, e_ms_ns_strides, a_element_op, @@ -451,22 +448,23 @@ namespace ck cde_element_op); }; - mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); - mBilinearArgs[0] = allocBilinearArgs( - mE_real, - mA_imag, - mB_imag, - mE_real, - DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); - - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); - mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_imag, - mB_real, - mE_imag, - DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); - + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); // TODO UNCOMMENT WHEN DONE // original @@ -507,16 +505,16 @@ namespace ck using DeviceArray = std::unique_ptr; // Manage extra memory for AOS->SOA - DeviceArray mA_real; - DeviceArray mA_imag; - DeviceArray mB_real; - DeviceArray mB_imag; - DeviceArray mE_real; - DeviceArray mE_imag; + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mE_real; + DeviceArray mE_imag; 
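            // Retained so the invoker can read the complex scale when it
            // launches the final recombination kernel.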
ScaleCDEElementwiseOperation element_op; - void* mE_grid; - index_t elementsE; + void* mE_grid; + index_t elementsE; }; // Invoker @@ -557,10 +555,13 @@ namespace ck if(arg.mE_grid != nullptr) { auto blockDim = dim3(1024); - auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); - hiptensor::multiply<<>>( - arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.element_op.scale_, arg.elementsE); + hiptensor::multiply<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.scale_, + arg.elementsE); //hiptensor::pack<<>>( // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } @@ -602,7 +603,8 @@ namespace ck // Call the base, then fwd to each arg. this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); auto* arg = dynamic_cast(p_arg); - this->BaseOperator::SetWorkSpacePointer(arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( arg->mScaleArgs[1].get(), p_workspace, s); this->BaseOperator::SetWorkSpacePointer( @@ -722,4 +724,3 @@ namespace ck } // namespace ck #endif // HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP - diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index d063ebf5..eb7d8919 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -582,9 +582,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - auto alphaValue - = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); - snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); + auto alphaValue = hiptensor::readVal( + alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%s", std::to_string(alphaValue).c_str()); } if(beta == nullptr) @@ -593,8 +593,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); + auto betaValue = hiptensor::readVal( + beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%s", std::to_string(betaValue).c_str()); } } else diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index 69e29b50..5a31a91f 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -327,3 +327,19 @@ bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType) { return !(computeType == hipType); } + +namespace std +{ + std::string to_string(const hiptensor::ScalarData& value) + { + if(value.mType == HIPTENSOR_COMPUTE_C32F || value.mType == HIPTENSOR_COMPUTE_C64F) + { + return string() + "[" + to_string(value.mComplex.x) + ", " + to_string(value.mComplex.y) + + "]"; + } + else + { + return to_string(value.mReal); + } + } +} diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 900b2069..db9ff6c7 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -115,6 +115,11 @@ bool operator==(hiptensorComputeType_t computeType, hipDataType hipType); bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType); bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType); +namespace std +{ + std::string to_string(const 
hiptensor::ScalarData& value); +} + #include "data_types_impl.hpp" #endif // HIPTENSOR_LIBRARY_DATA_TYPES_HPP diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index b0348b91..95c5d0f6 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -37,25 +37,12 @@ template -int bilinearContractionSample() +int bilinearContractionSample(void* alpha, void* beta) { - computeDataType alpha, beta; - if constexpr(std::is_same_v || std::is_same_v) - { - alpha = computeDataType(1.0, 1.0); - beta = computeDataType(1.0, 1.0); - } - else - { - alpha = (computeDataType)1.0f; - beta = (computeDataType)1.0f; - } - /********************** * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * *C_{m,n,u,v} @@ -280,17 +267,8 @@ int bilinearContractionSample() std::cout << "Launching contraction kernel..." << std::endl; - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); + CHECK_HIPTENSOR_ERROR(hiptensorContraction( + handle, &plan, alpha, A_d, B_d, beta, C_d, C_d, workspace, worksize, 0 /* stream */)); #if !NDEBUG bool printElements = false; diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp index f6714a2f..52915200 100644 --- a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_16BF; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp index 648675f6..5b3bb7cc 100644 --- a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp @@ -39,19 +39,20 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex CDataType; - typedef hipFloatComplex ComputeDataType; + typedef hipFloatComplex floatTypeCompute; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeC = HIP_C_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + floatTypeCompute alpha{1.0f, 1.0f}; + floatTypeCompute beta{1.0f, 1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp index 40708c77..8de0c534 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_16F; constexpr hiptensorComputeType_t typeCompute = 
HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp index 42f60ecb..6ce6d3c0 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp index d39a4fca..d4e28761 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp index ee046145..e493f1c3 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp index 673c4768..0faffc3e 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp index 412ebbc5..d5024eba 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp @@ -46,12 +46,13 @@ int main(int argc, char* argv[]) constexpr hipDataType typeC = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; return 
bilinearContractionSample(); + typeCompute>(&alpha, &beta); } diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp index 45914e30..5db4598d 100644 --- a/samples/01_contraction/simple_scale_contraction.hpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -37,23 +37,12 @@ template -int scaleContractionSample() +int scaleContractionSample(void* alpha) { - computeDataType alpha; - if constexpr(std::is_same_v || std::is_same_v) - { - alpha = computeDataType(1.0, 1.0); - } - else - { - alpha = (computeDataType)1.0f; - } - /********************** * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} **********************/ @@ -272,7 +261,7 @@ int scaleContractionSample() CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan, - (void*)&alpha, + alpha, A_d, B_d, nullptr, diff --git a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp index 7b0f8b6c..5a991dbc 100644 --- a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp +++ b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp @@ -40,12 +40,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_16BF; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp index 0f6eaac3..a3eb5e6f 100644 --- a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp +++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp @@ -39,19 +39,19 @@ int main(int argc, char* argv[]) typedef hipFloatComplex ADataType; typedef hipFloatComplex BDataType; typedef hipFloatComplex DDataType; - typedef hipFloatComplex ComputeDataType; + typedef hipFloatComplex floatTypeCompute; constexpr hipDataType typeA = HIP_C_32F; constexpr hipDataType typeB = HIP_C_32F; constexpr hipDataType typeD = HIP_C_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + floatTypeCompute alpha(1, 1); return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp index d69193f0..9283283b 100644 --- a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp @@ -46,12 +46,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_16F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp index c11b8ded..dac5e18b 100644 --- a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp @@ -47,12 +47,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + 
floatTypeCompute alpha = floatTypeCompute{1.0f}; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp index 377ee707..155f9585 100644 --- a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp @@ -47,12 +47,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp index e53cc468..2def291d 100644 --- a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp @@ -47,12 +47,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_32F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp index fdec48ab..7b2a9c95 100644 --- a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp @@ -46,12 +46,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp index 5eb94c15..201741e9 100644 --- a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp @@ -46,12 +46,12 @@ int main(int argc, char* argv[]) constexpr hipDataType typeD = HIP_R_64F; constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + floatTypeCompute alpha = 1; return scaleContractionSample(); + typeCompute>(&alpha); } diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml index 0d59c05d..b9fe7876 100644 --- a/test/01_contraction/configs/complex_bilinear_test_params.yaml +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -16,11 +16,11 @@ Worksize Prefs: Alphas: - [0, 0] - [1, 1] - - [1, 1] + - [1.1, 1.2] Betas: - [2, 2] - [0, 0] - - [2, 2] + - [2.2, 2.3] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml index 89f9e736..355a5050 100644 --- a/test/01_contraction/configs/complex_scale_test_params.yaml +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -16,11 +16,11 @@ Worksize Prefs: Alphas: - [0, 0] - [1, 1] - - [1, 1] + - [1.1, 1.2] Betas: - [2, 2] - [0, 0] - - [2, 2] + - [2.2, 2.3] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 
6, 5 ] diff --git a/test/device/common.hpp b/test/device/common.hpp index 392c74c9..283a9035 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -84,7 +84,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) } else { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast(value); } } From 48abe4a97bb2d567a022eb191ef9d3f1467c3c73 Mon Sep 17 00:00:00 2001 From: Meena Karunanidhi Date: Wed, 27 Dec 2023 13:29:43 -0500 Subject: [PATCH 41/42] Cleanup --- .../src/contraction/contraction_pack_util.hpp | 22 ------------------- .../device_contraction_bilinear_complex.hpp | 19 ---------------- .../device_contraction_scale_complex.hpp | 19 ---------------- 3 files changed, 60 deletions(-) diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp index 237e9d7f..5032fa8a 100644 --- a/library/src/contraction/contraction_pack_util.hpp +++ b/library/src/contraction/contraction_pack_util.hpp @@ -119,28 +119,6 @@ namespace hiptensor } } - /** - * \brief This function packs non-structured data (float / double) - * into structured data (hipFloatComplex / hipDoubleComplex). - */ - template - __global__ void pack(const InputType* in_real, InputType* in_img, OutputType *out, int length) - { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if(idx < length) - { - if constexpr(std::is_same_v) - { - out[idx] = make_hipFloatComplex((float)in_real[idx], (float)in_img[idx]); - } - else if constexpr(std::is_same_v) - { - out[idx] = make_hipDoubleComplex((double)in_real[idx], (double)in_img[idx]); - } - } - } - struct DeviceDeleter { void operator()(void* ptr) diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp index 712ff3b0..307ecb1c 100644 --- a/library/src/contraction/device/device_contraction_bilinear_complex.hpp +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -472,23 +472,6 @@ namespace ck mB_real, mE_imag, DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); - - // TODO UNCOMMENT WHEN DONE - // original - /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, cde_element_op); - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); - mBilinearArgs[0] = allocBilinearArgs( - mE_real, - mA_imag, - mB_imag, - mE_real, - BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); - mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_imag, - mB_real, - mE_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const @@ -573,8 +556,6 @@ namespace ck arg.element_op.alpha_, arg.element_op.beta_, arg.elementsE); - //hiptensor::pack<<>>( - // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp index b875db3b..5b70cc11 100644 --- a/library/src/contraction/device/device_contraction_scale_complex.hpp +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -465,23 +465,6 @@ namespace ck mB_real, mE_imag, DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); - - // TODO UNCOMMENT WHEN DONE - // original - /*mScaleArgs[0] = allocScaleArgs(mE_real, mA_real, mB_real, 
cde_element_op); - mScaleArgs[1] = allocScaleArgs(mE_imag, mA_real, mB_imag, cde_element_op); - mBilinearArgs[0] = allocBilinearArgs( - mE_real, - mA_imag, - mB_imag, - mE_real, - BilinearCDEElementwiseOperation{cde_element_op.scale_ * -1.0f, 1.0f}); - mBilinearArgs[1] = allocBilinearArgs( - mE_imag, - mA_imag, - mB_real, - mE_imag, - BilinearCDEElementwiseOperation{cde_element_op.scale_, 1.0f});*/ } void Print() const @@ -562,8 +545,6 @@ namespace ck ((ComplexE*)arg.mE_grid), arg.element_op.scale_, arg.elementsE); - //hiptensor::pack<<>>( - // arg.mE_real.get(), arg.mE_imag.get(), ((ComplexE*)arg.mE_grid), arg.elementsE); } return r0 + r1 + r2 + r3; From 348e28144e5f2502aed9c00aac9999ceb1e2cd29 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Fri, 29 Dec 2023 01:34:39 +0000 Subject: [PATCH 42/42] Set unit test difference threshold to epsilon of compute type - New single-kernel selection; to be improved. - Use the instance selected by brute force to compute tensors whose rightmost stride is 1 - Fix a bug that used the data type id as the compute data type id --- .../src/contraction/contraction_selection.cpp | 1404 ++++------------- .../permutation_cpu_reference_impl.hpp | 2 +- .../configs/bilinear_test_params.yaml | 4 +- .../configs/complex_bilinear_test_params.yaml | 4 +- .../configs/complex_scale_test_params.yaml | 4 +- .../configs/scale_test_params.yaml | 4 +- test/01_contraction/contraction_test.cpp | 28 +- .../permutation_cpu_impl_test.cpp | 6 +- test/02_permutation/permutation_resource.cpp | 2 +- test/02_permutation/permutation_test.cpp | 6 +- test/utils.hpp | 75 +- 11 files changed, 408 insertions(+), 1131 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 1f7b70a6..f96e8412 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -204,8 +204,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 7255639152084218514ull; + unique_id = 11124293857315312720ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -253,8 +252,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 7255639152084218514ull; + unique_id = 1953020431947874122ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -302,8 +300,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 8689089455041651212ull; + unique_id = 14895098881714635802ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -351,8 +348,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 8689089455041651212ull; + unique_id = 8517235228581081946ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -395,8 +391,7 @@ namespace hiptensor size_t unique_id = 0; - // TODO select unique_id - unique_id = 1078559130597702989ull; + unique_id = 17313709378682913599ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -438,8 +433,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 6506383527825239632ull; + + unique_id = 14397647188602189900ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -481,8 +476,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 
14486135440731032454ull; + + unique_id = 8339198051871565944ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -529,8 +524,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 11931735240548010466ull; + + unique_id = 2724417728984064737ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -573,329 +568,50 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 43) + unique_id = 5943247903036531691ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d5 <= 61) - { - if(d3 <= 236) - { - if(d4 <= 519) - { - if(d1 <= 744) - { - if(d6 <= 8) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d3 <= 32) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - else - { - if(d6 <= 2) - { - if(d5 <= 15) - { - unique_id = 17618515137355245877ull; - } - else - { - if(d6 <= 1) - { - unique_id = 10830479759059230274ull; - } - else - { - if(d5 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - else - { - if(d5 <= 2) - { - if(d6 <= 8) - { - unique_id = 17618515137355245877ull; - } - else - { - unique_id = 10830479759059230274ull; - } - } - else - { - if(d1 <= 54) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d4 <= 218) - { - if(d5 <= 36) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d2 <= 50) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - } - } - } - } - } - else - { - if(d6 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d4 <= 557) - { - if(d2 <= 165) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d5 <= 68) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 24) - { - if(d3 <= 435) - { - if(d5 <= 7) - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 744) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d6 <= 60) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - } - } - else - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - if(d5 <= 13) - { - if(d5 <= 7) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d6 <= 58) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d1 <= 642) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } - else - { - if(d6 <= 54) - { - if(d5 <= 37) - { - if(d4 <= 556) - { - unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 222) - { - if(d4 <= 556) - { - 
unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d4 <= 44) - { - if(d3 <= 436) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d1 <= 220) - { - if(d2 <= 107) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d3 <= 72) - { - unique_id = 16481146763982821264ull; - } - else - { - if(d2 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 17972447156160297755ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -910,7 +626,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -929,6 +645,7 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { + int d1 = a_ms_ks_lengths[0]; int d2 = a_ms_ks_lengths[1]; int d3 = b_ns_ks_lengths[0]; @@ -938,322 +655,49 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 9) + unique_id = 3893144338697524749ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 4) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 16) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d2 <= 196) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 113) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d3 <= 219) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 8) - { - if(d6 <= 28) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 2) - { - if(d6 <= 58) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 1) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d2 <= 163) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 465) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } - else - { - if(d3 <= 121) - { - if(d4 <= 483) - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 152) - { - 
unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - else - { - if(d3 <= 37) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d4 <= 135) - { - if(d3 <= 413) - { - if(d6 <= 30) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d4 <= 36) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 120) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 32) - { - if(d5 <= 32) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } - else - { - if(d2 <= 115) - { - if(d6 <= 40) - { - if(d2 <= 51) - { - unique_id = 222393107113976106ull; - } - else - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d4 <= 486) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d1 <= 235) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 22) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d6 <= 32) - { - if(d5 <= 26) - { - if(d6 <= 23) - { - if(d1 <= 116) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - else - { - if(d5 <= 18) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - if(d5 <= 32) - { - if(d6 <= 16) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + unique_id = 15165261158317928321ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1268,7 +712,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1296,8 +740,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - // TODO select unique_id - unique_id = 11912251726020349830ull; + + unique_id = 
14511729289005214097ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1312,7 +756,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1339,8 +783,8 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; - unique_id = 15375432626310194825ull; - // TODO select unique_id + + unique_id = 3636246152928348445ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1355,7 +799,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1384,238 +833,55 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 36) + unique_id = 5711776907278244209ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 35) - { - if(d1 <= 763) - { - if(d6 <= 3) - { - if(d5 <= 8) - { - unique_id = 9769367948782541618ull; - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - if(d6 <= 24) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 17) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - } - else - { - if(d5 <= 9) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d1 <= 759) - { - if(d6 <= 67) - { - if(d3 <= 535) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d4 <= 615) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - else - { - if(d5 <= 25) - { - if(d4 <= 428) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - else - { - if(d6 <= 64) - { - if(d3 <= 65) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 25) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 33) - { - if(d6 <= 8) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d2 <= 565) - { - if(d1 <= 646) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d6 <= 27) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 53) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - else - { - if(d6 <= 20) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 64) - { - if(d1 <= 648) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d6 <= 25) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } - } - else - { - if(d5 <= 45) - { - if(d6 <= 50) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - if(d6 <= 43) - { - if(d5 <= 52) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } 
- } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 355777364055884033ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1630,7 +896,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1649,6 +920,7 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { + int d1 = a_ms_ks_lengths[0]; int d2 = a_ms_ks_lengths[1]; int d3 = b_ns_ks_lengths[0]; @@ -1658,217 +930,55 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 39) + unique_id = 3085227716611397774ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d3 <= 937) - { - if(d6 <= 1) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d4 <= 754) - { - if(d5 <= 33) - { - if(d5 <= 1) - { - if(d6 <= 25) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d6 <= 6) - { - if(d5 <= 8) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d1 <= 404) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 50) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - } - else - { - unique_id = 1830537384143755749ull; - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 32) - { - if(d2 <= 832) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 8) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 24) - { - unique_id = 17689908062647780665ull; - } - else - { - if(d5 <= 64) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - else - { - if(d6 <= 46) - { - if(d5 <= 54) - { - if(d1 <= 460) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 49) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - else - { - if(d1 <= 182) - { - if(d5 <= 65) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - else - { - if(d5 <= 49) - { - if(d6 <= 64) - { - if(d1 <= 411) - { - if(d2 <= 396) - { - unique_id = 1830537384143755749ull; - } - else - { - 
unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 53) - { - if(d1 <= 222) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 2196983681630807584ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1901,7 +1011,7 @@ namespace hiptensor const uint64_t workspaceSize) { if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F - && computeType == HIP_R_32F) + && computeType == HIPTENSOR_COMPUTE_32F) { return ActorCriticSelection<_Float16, _Float16, @@ -1925,7 +1035,7 @@ namespace hiptensor workspaceSize); } else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F - && computeType == HIP_R_32F) + && computeType == HIPTENSOR_COMPUTE_32F) { return ActorCriticSelection<_Float16, _Float16, @@ -1949,7 +1059,7 @@ namespace hiptensor workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE - && typeE == HIP_R_16BF && computeType == HIP_R_32F) + && typeE == HIP_R_16BF && computeType == HIPTENSOR_COMPUTE_32F) { return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_32F && typeB == HIP_C_32F && typeD == HIP_C_32F && typeE == HIP_C_32F + && computeType == HIPTENSOR_COMPUTE_C32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == NONE_TYPE && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == HIP_C_64F && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + 
b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } return HIPTENSOR_STATUS_EXECUTION_FAILED; } } diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp index c1d4a3af..4820274f 100644 --- a/library/src/permutation/permutation_cpu_reference_impl.hpp +++ b/library/src/permutation/permutation_cpu_reference_impl.hpp @@ -92,7 +92,7 @@ namespace hiptensor auto bOffset = std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0); #endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR - B[bOffset] = static_cast(A[elementIndex] * alphaValue); + B[bOffset] = static_cast(A[elementIndex] * (DataType)alphaValue); } return HIPTENSOR_STATUS_SUCCESS; diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 1e7999fc..9306445a 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml index b9fe7876..dfbb814e 100644 --- a/test/01_contraction/configs/complex_bilinear_test_params.yaml +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -6,7 +6,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -24,7 +24,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml index 355a5050..4bad2a9b 100644 --- a/test/01_contraction/configs/complex_scale_test_params.yaml +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -6,7 +6,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -24,7 +24,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index bc8289f5..4c52eeda 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - # - HIPTENSOR_ALGO_ACTOR_CRITIC + - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 2 ] + - [ 24, 18, 2, 4, 9, 1 ] Strides: - [] ... 
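The test changes that follow thread the compute type into the result comparison so that the pass/fail threshold tracks the precision the kernel actually computed in. A minimal standalone sketch of that rule (the helper name and sample numbers below are illustrative only; the in-tree version is the getEpsilon/compareEqual pair in the test/utils.hpp hunk at the end of this patch, whose default tolerance multiplier is 100.0):

#include <cstdio>
#include <limits>

// Illustrative only: the allowed relative error scales with the machine
// epsilon of the *compute* type, not of the output type.
static bool withinComputeTypeTolerance(double maxRelativeError,
                                       double computeTypeEpsilon,
                                       double tolerance = 100.0)
{
    return maxRelativeError <= tolerance * computeTypeEpsilon;
}

int main()
{
    double epsF32   = std::numeric_limits<float>::epsilon(); // ~1.19e-07
    double epsBF16  = 0.0078125; // bf16 keeps 7 mantissa bits: eps = 2^-7
    double observed = 1.0e-4;    // example max relative error from a kernel run

    // f32 data computed in f32: 1e-4 exceeds 100 * eps(f32), so it fails ...
    std::printf("f32 compute:  %s\n",
                withinComputeTypeTolerance(observed, epsF32) ? "pass" : "fail");
    // ... but the same error is acceptable when the compute type was bf16.
    std::printf("bf16 compute: %s\n",
                withinComputeTypeTolerance(observed, epsBF16) ? "pass" : "fail");
    return 0;
}

This is why the yaml configs above can enable HIPTENSOR_ALGO_ACTOR_CRITIC and non-trivial alpha/beta values: a bf16 or f16 compute path no longer trips a threshold derived from the f32/f64 output type.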
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index a75cf7bf..664da2ec 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -628,8 +628,8 @@ namespace hiptensor DDataType, workspace)); - size_t elementsCD = std::accumulate(c_ms_ns.mLengths.begin(), - c_ms_ns.mLengths.end(), + size_t elementsCD = std::accumulate(d_ms_ns.mLengths.begin(), + d_ms_ns.mLengths.end(), size_t{1}, std::multiplies()); @@ -639,8 +639,11 @@ namespace hiptensor if(DDataType == HIP_R_16F) { - std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( - (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + (_Float16*)reference.get(), + elementsCD, + computeType); } else if(DDataType == HIP_R_16BF) { @@ -648,17 +651,24 @@ namespace hiptensor = compareEqualLaunchKernel( (hip_bfloat16*)resource->deviceD().get(), (hip_bfloat16*)reference.get(), - elementsCD); + elementsCD, + computeType); } else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F) { - std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( - (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel((float*)resource->deviceD().get(), + (float*)reference.get(), + elementsCD, + computeType); } else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F) { - std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( - (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD); + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel((double*)resource->deviceD().get(), + (double*)reference.get(), + elementsCD, + computeType); } EXPECT_TRUE(mValidationResult) << "Max relative error: " << mMaxRelativeError; diff --git a/test/02_permutation/permutation_cpu_impl_test.cpp b/test/02_permutation/permutation_cpu_impl_test.cpp index 014dbc61..5a885f0b 100644 --- a/test/02_permutation/permutation_cpu_impl_test.cpp +++ b/test/02_permutation/permutation_cpu_impl_test.cpp @@ -125,7 +125,11 @@ auto permuteWithCpu(hipDataType typeA, hipDataType typeB, hipDataType typeComput &descB, modeB.data(), typeCompute); - return compareEqual(referenceArray.data(), bArray.data(), bArray.size(), 10); + return compareEqual(referenceArray.data(), + bArray.data(), + bArray.size(), + hiptensor::convertToComputeType(typeCompute), + 10); } TEST(PermutationCpuImplTest, CompareF32ResultWithReference) diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp index 1f448ff8..6acd7577 100644 --- a/test/02_permutation/permutation_resource.cpp +++ b/test/02_permutation/permutation_resource.cpp @@ -72,7 +72,7 @@ namespace hiptensor mCurrentAllocByte = requiredMemorySize; needFillData = true; } - else if(mCurrentDataType != dataType) + if(mCurrentDataType != dataType || mCurrentMatrixElement < requiredElementCount) { needFillData = true; } diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp index cfadf5c0..078c78a4 100644 --- a/test/02_permutation/permutation_test.cpp +++ b/test/02_permutation/permutation_test.cpp @@ -257,7 +257,8 @@ namespace hiptensor std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel((float*)resource->deviceB().get(), 
(float*)resource->deviceReference().get(), - resource->getCurrentMatrixElement()); + resource->getCurrentMatrixElement(), + convertToComputeType(computeDataType)); } else if(abDataType == HIP_R_16F) { @@ -273,7 +274,8 @@ namespace hiptensor std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( (_Float16*)resource->deviceB().get(), (_Float16*)resource->deviceReference().get(), - resource->getCurrentMatrixElement()); + resource->getCurrentMatrixElement(), + convertToComputeType(computeDataType)); } } diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..fc999738 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -57,6 +57,59 @@ CHECK_HIP_ERROR(hipHostFree(ptr)); \ } +inline double getEpsilon(hiptensorComputeType_t id) +{ + auto toDouble = [](auto const& val) { return static_cast(static_cast(val)); }; + + if(id == HIPTENSOR_COMPUTE_16F) + { + return toDouble(std::numeric_limits<_Float16>::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + return 0; + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + return toDouble(std::numeric_limits::epsilon()); + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return 0; + } +} + inline bool isF32Supported() { hipDevice_t mHandle; @@ -137,10 +190,11 @@ __host__ static inline void } template -std::pair compareEqual(DDataType const* deviceD, - DDataType const* hostD, - std::size_t elementsD, - double tolerance = 100.0) +std::pair compareEqual(DDataType const* deviceD, + DDataType const* hostD, + std::size_t elementsD, + hiptensorComputeType_t computeType, + double tolerance = 100.0) { bool retval = true; double max_relative_error = 0.0; @@ -191,7 +245,7 @@ std::pair compareEqual(DDataType const* deviceD, } } - auto eps = toDouble(std::numeric_limits::epsilon()); + auto eps = getEpsilon(computeType); if(isInf) { retval = false; @@ -211,10 +265,11 @@ std::pair compareEqual(DDataType const* deviceD, } template -std::pair compareEqualLaunchKernel(DDataType* deviceD, - DDataType* hostD, - std::size_t elementsD, - double tolerance = 100.0) +std::pair compareEqualLaunchKernel(DDataType* deviceD, + DDataType* hostD, + std::size_t elementsD, + hiptensorComputeType_t computeType, + double tolerance = 100.0) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,7 +331,7 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; - auto eps = toDouble(std::numeric_limits::epsilon()); + auto eps = getEpsilon(computeType); if(isNaN) { retval = false;
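With compareEqual and compareEqualLaunchKernel now deriving eps from getEpsilon(computeType), two properties carry the scheme: epsilon shrinks as compute precision grows, and integer compute types report an epsilon of 0, which degrades the comparison to exact equality. A hypothetical sanity test of those properties (illustrative, not part of the patch; it assumes getEpsilon is reachable via test/utils.hpp and that the complex compute types reuse their real scalar's epsilon, as the hunk above suggests):

#include <gtest/gtest.h>
#include "utils.hpp" // assumed include for getEpsilon from the hunk above

TEST(UtilsTest, EpsilonTracksComputePrecision)
{
    // Higher-precision compute types must yield tighter thresholds.
    EXPECT_LT(getEpsilon(HIPTENSOR_COMPUTE_64F), getEpsilon(HIPTENSOR_COMPUTE_32F));
    EXPECT_LT(getEpsilon(HIPTENSOR_COMPUTE_32F), getEpsilon(HIPTENSOR_COMPUTE_16F));
    EXPECT_LT(getEpsilon(HIPTENSOR_COMPUTE_16F), getEpsilon(HIPTENSOR_COMPUTE_16BF));
    // Assumption: complex compute types reuse their real scalar's epsilon.
    EXPECT_EQ(getEpsilon(HIPTENSOR_COMPUTE_C32F), getEpsilon(HIPTENSOR_COMPUTE_32F));
    // Integer compute types return 0, i.e. tolerance * eps == 0: exact match.
    EXPECT_EQ(getEpsilon(HIPTENSOR_COMPUTE_32I), 0.0);
}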