diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 95e8b2ba..0e0a252e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,7 +6,7 @@ version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values - directory: "/docs/.sphinx" # Location of package manifests + directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" diff --git a/.gitignore b/.gitignore index ad44a303..9945a9dc 100644 --- a/.gitignore +++ b/.gitignore @@ -50,13 +50,3 @@ build* \#*\# *~ *.log - -# documentation artifacts -build/ -_build/ -_images/ -_static/ -_templates/ -_toc.yml -docBin/ -_doxygen/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e2bf130c..9e6678ab 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,11 +10,9 @@ formats: [htmlzip, pdf, epub] python: install: - - requirements: docs/.sphinx/requirements.txt + - requirements: docs/sphinx/requirements.txt build: - os: ubuntu-20.04 + os: ubuntu-22.04 tools: python: "3.8" - apt_packages: - - "doxygen" diff --git a/README.md b/README.md index f5e55943..5af7912d 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Run the steps below to build documentation locally. ```shell cd docs -pip3 install -r .sphinx/requirements.txt +pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` @@ -98,21 +98,24 @@ After configuration, build with `cmake --build -- -j` ### Logger tests Tests API implementation of logger verbosity and functionality. -o /bin/logger_test + +* `/bin/logger_test` ## Running Contraction Tests ### Bilinear contraction tests Tests the API implementation of bilinear contraction algorithm with validation. -o /bin/bilinear_contraction_f32_test -o /bin/bilinear_contraction_f64_test + +* `/bin/bilinear_contraction_f32_test` +* `/bin/bilinear_contraction_f64_test` ### Scale contraction tests Tests the API implementation of scale contraction algorithm with validation. -o /bin/scale_contraction_f32_test -o /bin/scale_contraction_f64_test + +* `/bin/scale_contraction_f32_test` +* `/bin/scale_contraction_f64_test` ### Samples @@ -121,12 +124,14 @@ These are stand-alone use-cases of the hipTensor contraction operations. ## F32 Bilinear contraction Demonstrates the API implementation of bilinear contraction operation without validation. -o /bin/simple_contraction_bilinear_f32 + +* `/bin/simple_contraction_bilinear_f32` ## F32 Scale contraction Demonstrates the API implementation of scale contraction operation without validation. -o /bin/simple_contraction_scale_f32 + +* `/bin/simple_contraction_scale_f32` ### Build Samples as external client diff --git a/docs/.gitignore b/docs/.gitignore index a44ccbe0..594c0c8c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,7 +1,5 @@ -.doxygen/docBin -.sphinx/_toc.yml -_build -_doxygen -_images -_static -_templates \ No newline at end of file +doxygen/html +doxygen/xml +sphinx/_toc.yml +_build/ +_doxygen/ diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in deleted file mode 100644 index 313c5e94..00000000 --- a/docs/.sphinx/requirements.in +++ /dev/null @@ -1 +0,0 @@ -rocm-docs-core>=0.24.0 diff --git a/docs/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst index 551e2ee0..77e86343 100644 --- a/docs/API_Reference_Guide.rst +++ b/docs/API_Reference_Guide.rst @@ -3,15 +3,16 @@ Introduction ************ -hiptensor Data Types +hipTensor Data Types ==================== +.. 
+
 hiptensorStatus_t
 -----------------

 .. doxygenenum:: hiptensorStatus_t

-
 hiptensorComputeType_t
 ----------------------

@@ -160,3 +161,5 @@ hiptensorLoggerForceDisable
 ---------------------------

 .. doxygenfunction:: hiptensorLoggerForceDisable
+
+..
diff --git a/docs/Contributors_Guide.rst b/docs/Contributors_Guide.rst
index aeb87211..212248be 100644
--- a/docs/Contributors_Guide.rst
+++ b/docs/Contributors_Guide.rst
@@ -15,8 +15,7 @@ License Agreement

 Pull-request guidelines
 =======================
-
-Our code contriubtion guidelines closely follows the model of `GitHub
+Our code contribution guidelines closely follow the model of `GitHub
 pull-requests `__. The hipTensor repository follows a workflow which dictates a /master
 branch where releases are cut, and a /develop branch which serves as an
 integration branch for new code.

 Pull requests should:

@@ -30,8 +29,8 @@ The hipTensor repository follows a workflow which dictates a /master branch wher
 - code must also have benchmark tests, and performance must approach
   the compute bound limit or memory bound limit.

-StyleGuide
-==========
+Style Guide
+===========

 This project follows the `CPP Core guidelines `__,
@@ -44,7 +43,7 @@ Interface
 ---------

 - Library code should use C++17
-- Avoid CamelCase
+- Avoid camel case

   - This rule applies specifically to publicly visible APIs, but is also
     encouraged (not mandated) for internal code
@@ -52,8 +51,8 @@ Philosophy
 ----------

 -  `P.2 `__:
-   Write in ISO Standard C++14 (especially to support windows, linux and
-   macos plaforms )
+   Write in ISO Standard C++14 (especially to support Windows, Linux and
+   macOS platforms)
 -  `P.5 `__:
    Prefer compile-time checking to run-time checking
@@ -105,19 +104,19 @@
 will result in different results.

 To format a file, use:

-::
+.. code-block::

-    /opt/rocm/llvm/bin/clang-format -style=file -i
+   /opt/rocm/llvm/bin/clang-format -style=file -i

 To format all files, run the following script in hipTensor directory:

-::
+.. code-block::

-    #!/bin/bash
-    git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i
+   #!/bin/bash
+   git ls-files -z *.cc *.cpp *.h *.hpp *.cl *.h.in *.hpp.in *.cpp.in | xargs -0 /opt/rocm/llvm/bin/clang-format -style=file -i

 Also, githooks can be installed to format the code per-commit:

-::
+.. code-block::

-    ./.githooks/install
+   ./.githooks/install
diff --git a/docs/Linux_Install_Guide.rst b/docs/Linux_Install_Guide.rst
index 47cdc339..ace565c1 100644
--- a/docs/Linux_Install_Guide.rst
+++ b/docs/Linux_Install_Guide.rst
@@ -104,9 +104,9 @@ Minimum ROCm version support is 5.7.

 By default, the project is configured as Release mode.

-To build only library, run the following comomand :
+To build only the library, run the following command:

-    CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF
+    :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=OFF`

 Here are some other example project configurations:

 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+
 | Configuration                     | Command                                                                                                            |
 +===================================+====================================================================================================================+
-| Basic                             | CC=hipcc CXX=hipcc cmake -B .                                                                                      |
+| Basic                             | :code:`CC=hipcc CXX=hipcc cmake -B .`                                                                              |
 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Targeting gfx908                  | CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-                                                       |
+| Targeting gfx908                  | :code:`CC=hipcc CXX=hipcc cmake -B . -DAMDGPU_TARGETS=gfx908:xnack-`                                               |
 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+
-| Debug build                       | CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug                                                             |
+| Debug build                       | :code:`CC=hipcc CXX=hipcc cmake -B . -DCMAKE_BUILD_TYPE=Debug`                                                     |
 +-----------------------------------+--------------------------------------------------------------------------------------------------------------------+

 After configuration, build with

-    cmake --build -- -j
+    :code:`cmake --build -- -j`

 Build library + samples
 ^^^^^^^^^^^^^^^^^^^^^^^

-To build library and samples, run the following comomand :
+To build the library and samples, run the following command:

-    CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON
+    :code:`CC=hipcc CXX=hipcc cmake -B . -DHIPTENSOR_BUILD_TESTS=OFF -DHIPTENSOR_BUILD_SAMPLES=ON`

 After configuration, build with

-    cmake --build -- -j
+    :code:`cmake --build -- -j`

-The samples folder in contains executables in the table below.
+The samples folder in :code:`` contains executables in the table below.

 =================================== ===================================================================================
 executable name                     description
 =================================== ===================================================================================

 Build library + tests
 ^^^^^^^^^^^^^^^^^^^^^^

 To build library and tests, run the following command :

-    CC=hipcc CXX=hipcc cmake -B .
+    :code:`CC=hipcc CXX=hipcc cmake -B .`

 After configuration, build with

-    cmake --build -- -j
+    :code:`cmake --build -- -j`

-The tests in contains executables in the table below.
+The tests in `` contains executables in the table below.

 ====================================== ===================================================================================
 executable name                        description
 ====================================== ===================================================================================
@@ -177,6 +177,7 @@ Build library + Documentation

 Run the steps below to build documentation locally.

+.. code-block::

    cd docs

    sudo apt-get update
@@ -191,4 +192,4 @@ Run the steps below to build documentation locally.

    pdflatex hiptensor.tex

-Generates hiptensor.pdf here
+Generates :code:`hiptensor.pdf` here.
diff --git a/docs/Programmers_Guide.rst b/docs/Programmers_Guide.rst
index 460bb970..047c1f5a 100644
--- a/docs/Programmers_Guide.rst
+++ b/docs/Programmers_Guide.rst
@@ -1,4 +1,3 @@
-
 ===================
 Programmer's Guide
 ===================
@@ -17,84 +16,84 @@ The hipTensor code is split into four major parts:

 The `library` directory
 ^^^^^^^^^^^^^^^^^^^^^^^

-library/include/hiptensor/
-'''''''''''''''''''''''''''
+`library/include/hiptensor/`
+''''''''''''''''''''''''''''

 Contains C++ include files for the hipTensor API. These files also contain Doxygen comments that document the API.

-library/include/hiptensor/internal
-''''''''''''''''''''''''''''''''''
+`library/include/hiptensor/internal`
+''''''''''''''''''''''''''''''''''''

 Internal include files for:

 - Utility Code
 - Generate Tensor Utility

-library/src/
-''''''''''''
+`library/src/`
+''''''''''''''

 Contains logger, device and performance functions.
-library/src/contraction/ -'''''''''''''''''''''''' +`library/src/contraction/` +'''''''''''''''''''''''''' Contains hipTensor core composable kernel header functions and contraction initialization functions. -library/src/contraction/device -'''''''''''''''''''''''''''''' +`library/src/contraction/device` +'''''''''''''''''''''''''''''''' Contains hipTensor Bilinear and Scale instance functions The `samples` directory ^^^^^^^^^^^^^^^^^^^^^^^ -01_contraction/simple_bilinear_contraction_f32.cpp -'''''''''''''''''''''''''''''''''''''''''''''''''' +`01_contraction/simple_bilinear_contraction_f32.cpp` +'''''''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling bilinear contraction for fp32 input, output and compute types +sample code for calling bilinear contraction for :code:`fp32` input, output and compute types -01_contraction/simple_scale_contraction_f32.cpp -''''''''''''''''''''''''''''''''''''''''''''''' +`01_contraction/simple_scale_contraction_f32.cpp` +''''''''''''''''''''''''''''''''''''''''''''''''' -sample code for calling scale contraction for fp32 input, output and compute types +sample code for calling scale contraction for :code:`fp32` input, output and compute types The `test` directory ^^^^^^^^^^^^^^^^^^^^^^^ -00_unit/logger -'''''''''''''' +`00_unit/logger` +'''''''''''''''' Test code for testing logger API Functions of hipTensor -01_contraction/bilinear_contraction_f32 -''''''''''''''''''''''''''''''''''''''' +`01_contraction/bilinear_contraction_f32` +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F32 types. -01_contraction/bilinear_contraction_f64 -''''''''''''''''''''''''''''''''''''''' +`01_contraction/bilinear_contraction_f64` +''''''''''''''''''''''''''''''''''''''''' Test code for testing the bilinear contraction functionality and log metrics for F64 types. -01_contraction/scale_contraction_f32 -'''''''''''''''''''''''''''''''''''' +`01_contraction/scale_contraction_f32` +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F32 types. -01_contraction/scale_contraction_f64 -'''''''''''''''''''''''''''''''''''' +`01_contraction/scale_contraction_f64` +'''''''''''''''''''''''''''''''''''''' Test code for testing the scale contraction functionality and log metrics for F64 types. Infrastructure ^^^^^^^^^^^^^^ -- CMake is used to build and package hipTensor. There are CMakeLists.txt files throughout the code. -- Doxygen/Breathe/Sphinx/ReadTheDocs are used to produce documentation. Content for the documentation is from: +- CMake is used to build and package hipTensor. There are :code:`CMakeLists.txt` files throughout the code. +- `Doxygen/Breathe/Sphinx/ReadtheDocs` are used to produce documentation. Content for the documentation is from: - - Doxygen comments in include files in the directory library/include - - files in the directory docs/ + - Doxygen comments in include files in the directory :code:`library/include` + - files in the directory :code:`docs/` - Jenkins is used to automate Continuous Integration testing. -- clang-format is used to format C++ code. +- :code:`clang-format` is used to format C++ code. 
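Taken together, the configure, build, and test commands documented in the README and Linux_Install_Guide hunks above form one end-to-end flow. Below is a minimal sketch of that flow, assuming an in-source build directory (the guide's `cmake -B .`), the default configuration (tests and samples enabled), and the `./bin/` output location the README lists for test executables:

```shell
# Configure with hipcc as the compiler; tests and samples build by default
CC=hipcc CXX=hipcc cmake -B .

# Build in parallel; "." as the build directory follows the cmake -B . above
cmake --build . -- -j

# Run one of the test executables listed in the README
./bin/logger_test
```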
diff --git a/docs/conf.py b/docs/conf.py index 4f00fb9e..e7e64d90 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,11 +29,31 @@ # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re + from rocm_docs import ROCmDocs -docs_core = ROCmDocs("hipTensor Documentation") -docs_core.run_doxygen() +with open('../CMakeLists.txt', encoding='utf-8') as f: + match = re.search(r'.*\bset \( VERSION_STRING\s+\"?([0-9.]+)[^0-9.]+', f.read()) + if not match: + raise ValueError("VERSION not found!") + version_number = match[1] +left_nav_title = f"hipTensor {version_number} Documentation" + +# for PDF output on Read the Docs +project = "hipTensor Documentation" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." +version = version_number +release = version_number + +external_toc_path = "./sphinx/_toc.yml" + +docs_core = ROCmDocs(left_nav_title) +docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() +external_projects_current_project = "hiptensor" + for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) diff --git a/docs/.doxygen/Doxyfile b/docs/doxygen/Doxyfile similarity index 99% rename from docs/.doxygen/Doxyfile rename to docs/doxygen/Doxyfile index 59a973b7..6f96968a 100644 --- a/docs/.doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = docBin +OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -786,7 +786,8 @@ WARN_AS_ERROR = YES INPUT = ../../library/include/hiptensor \ ../../library/include/hiptensor/internal \ - ../../library/src + ../../library/src \ + ../../README.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -965,7 +966,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. -USE_MDFILE_AS_MAINPAGE = ../README.md +USE_MDFILE_AS_MAINPAGE = ../../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing @@ -2074,7 +2075,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = __device__ +PREDEFINED = __device__ \ + DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/docs/index.rst b/docs/index.rst index 566a00e5..ba5e1cb7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,6 @@ ============================================================================ -hiptensor: A High-Performance HIP Library For Tensor Primitives +hipTensor: A High-Performance HIP Library For Tensor Primitives ============================================================================ -hiptensor is AMD's C++ library for accelerating tensor primitives based on the +hipTensor is AMD's C++ library for accelerating tensor primitives based on the composable kernel library, through general purpose kernel languages, like HIP C++. diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 00000000..141b5d3c --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,4 @@ +License +======= + +.. include:: ../LICENSE diff --git a/docs/.sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in similarity index 84% rename from docs/.sphinx/_toc.yml.in rename to docs/sphinx/_toc.yml.in index 37b5a62b..6da76c27 100644 --- a/docs/.sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -8,3 +8,6 @@ subtrees: - file: API_Reference_Guide - file: Programmers_Guide - file: Contributors_Guide + - caption: About + entries: + - file: license diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in new file mode 100644 index 00000000..b80af261 --- /dev/null +++ b/docs/sphinx/requirements.in @@ -0,0 +1 @@ +rocm-docs-core==0.30.3 diff --git a/docs/.sphinx/requirements.txt b/docs/sphinx/requirements.txt similarity index 96% rename from docs/.sphinx/requirements.txt rename to docs/sphinx/requirements.txt index 94103e1a..81f0b559 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==41.0.4 +cryptography==41.0.6 # via pyjwt deprecated==1.2.13 # via pygithub @@ -40,7 +40,7 @@ fastjsonschema==2.16.3 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests @@ -84,9 +84,7 @@ pygments==2.15.0 # pydata-sphinx-theme # sphinx pyjwt[crypto]==2.6.0 - # via - # pygithub - # pyjwt + # via pygithub pynacl==1.5.0 # via pygithub pytz==2023.3.post1 @@ -100,7 +98,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core==0.28.0 +rocm-docs-core==0.30.3 # via -r requirements.in smmap==5.0.0 # via gitdb @@ -143,7 +141,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx typing-extensions==4.5.0 # via pydata-sphinx-theme -urllib3==1.26.15 +urllib3==1.26.18 # via requests wrapt==1.15.0 # via deprecated diff --git a/library/include/hiptensor/hiptensor_types.hpp b/library/include/hiptensor/hiptensor_types.hpp index 85a5d90e..ca666a5b 100644 --- a/library/include/hiptensor/hiptensor_types.hpp +++ b/library/include/hiptensor/hiptensor_types.hpp @@ -90,6 +90,8 @@ typedef enum HIPTENSOR_COMPUTE_8I = (1U << 8U), HIPTENSOR_COMPUTE_32U = (1U << 7U), HIPTENSOR_COMPUTE_32I = (1U << 9U), + HIPTENSOR_COMPUTE_C32F = (1U << 11U), + HIPTENSOR_COMPUTE_C64F = (1U << 12U), HIPTENSOR_COMPUTE_NONE = 0 } hiptensorComputeType_t; diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp similarity index 100% rename from library/src/include/config.hpp rename to library/include/hiptensor/internal/config.hpp diff --git a/library/include/hiptensor/internal/hiptensor-version.hpp.in b/library/include/hiptensor/internal/hiptensor-version.hpp.in index e1942a2b..89247375 100644 --- 
a/library/include/hiptensor/internal/hiptensor-version.hpp.in
+++ b/library/include/hiptensor/internal/hiptensor-version.hpp.in
@@ -38,6 +38,15 @@
 #define HIPTENSOR_PATCH_VERSION @hiptensor_VERSION_PATCH@
 // clang-format on

+/**
+ * \brief Returns the version number of hipTensor
+ *
+ * \details Returns the version with the three least significant digits for the patch version,
+ * the next three digits for the minor version, and the most significant digits for the major version.
+ *
+ * \returns The version number.
+ */
+
 inline size_t hiptensorGetVersion()
 {
     return HIPTENSOR_MAJOR_VERSION * 1e6 + HIPTENSOR_MINOR_VERSION * 1e3 + HIPTENSOR_PATCH_VERSION;
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index f2df2dd2..746f1bbf 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -29,8 +29,10 @@
 #include
 #include
 #include
+#include

 #include "../hiptensor_types.hpp"
+#include "types_ext.hpp"

 #ifndef CHECK_HIP_ERROR
 #define CHECK_HIP_ERROR(expression)     \
@@ -60,6 +62,20 @@
     }
 #endif

+inline std::ostream& operator<<(std::ostream& os, const hipFloatComplex& fc)
+{
+    std::string separator = (hipCimagf(fc) >= 0) ? " + " : "";
+
+    return os << hipCrealf(fc) << separator << hipCimagf(fc) << "i";
+}
+
+inline std::ostream& operator<<(std::ostream& os, const hipDoubleComplex& dc)
+{
+    std::string separator = (hipCimag(dc) >= 0) ? " + " : "";
+
+    return os << hipCreal(dc) << separator << hipCimag(dc) << "i";
+}
+
 template
 void hiptensorPrintArrayElements(std::ostream& stream, T* vec, size_t size)
 {
diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp
similarity index 97%
rename from library/src/include/native_types.hpp
rename to library/include/hiptensor/internal/native_types.hpp
index 6c9dbee8..69ce706f 100644
--- a/library/src/include/native_types.hpp
+++ b/library/include/hiptensor/internal/native_types.hpp
@@ -33,8 +33,6 @@
 #include
 #include

-#include "xfloat32.hpp"
-
 namespace hiptensor
 {

@@ -84,9 +82,6 @@ namespace hiptensor
 #if !HIPTENSOR_NO_HALF
     using hfloat16_t = __half;
 #endif // !HIPTENSOR_NO_HALF
-
-    using xfloat32_t = hiptensor_xfloat32;
-
     // clang-format off
diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp
similarity index 100%
rename from library/src/include/native_types_impl.hpp
rename to library/include/hiptensor/internal/native_types_impl.hpp
diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp
similarity index 80%
rename from library/src/include/type_traits.hpp
rename to library/include/hiptensor/internal/type_traits.hpp
index 3867839d..81bafacd 100644
--- a/library/src/include/type_traits.hpp
+++ b/library/include/hiptensor/internal/type_traits.hpp
@@ -26,9 +26,11 @@
 #ifndef HIPTENSOR_TYPE_TRAITS_HPP
 #define HIPTENSOR_TYPE_TRAITS_HPP

-#include "native_types.hpp"
 #include

+#include "config.hpp"
+#include "native_types.hpp"
+
 namespace hiptensor
 {
     namespace detail
@@ -69,9 +71,8 @@ namespace hiptensor
         {
             union
             {
-                uint32_t   i32;
-                float32_t  f32;
-                xfloat32_t xf32;
+                uint32_t  i32;
+                float32_t f32;
             };
             constexpr Fp32Bits(uint32_t initVal)
                 : i32(initVal)
             {
             }
             constexpr Fp32Bits(float32_t initVal)
                 : f32(initVal)
             {
             }
-            constexpr Fp32Bits(xfloat32_t initVal)
-                : xf32(initVal)
-            {
-            }
         };
     } // namespace detail

@@ -96,6 +93,7 @@ namespace std
     /////////// std::numeric_limits //////////////
/////////////////////////////////////////////////////////// +#ifndef DOXYGEN_SHOULD_SKIP_THIS template <> HIPTENSOR_HOST_DEVICE constexpr hiptensor::float16_t numeric_limits::epsilon() noexcept @@ -273,68 +271,7 @@ namespace std hiptensor::detail::Fp16Bits eps(static_cast(0x7FC0)); return eps.b16; } - - /////////////////////////////////////////////////////////// - /////////// std::numeric_limits ////////////// - /////////////////////////////////////////////////////////// - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::epsilon() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_EPSILON)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::infinity() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(HUGE_VALF)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::lowest() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(-FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::max() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MAX)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::min() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(FLT_MIN)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::quiet_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF80000)); - return eps.xf32; - } - - template <> - HIPTENSOR_HOST_DEVICE constexpr hiptensor::xfloat32_t - numeric_limits::signaling_NaN() noexcept - { - hiptensor::detail::Fp32Bits eps(static_cast(0x7FF00000)); - return eps.xf32; - } - // @endcond - +#endif // DOXYGEN_SHOULD_SKIP_THIS } // namespace std namespace hiptensor @@ -378,13 +315,6 @@ namespace hiptensor // b16 mantissa is 7 bits return ((int32_t)1 << 8); } - - template ::value, int> = 0> - constexpr auto maxExactInteger() -> int32_t - { - // xf32 mantissa is 7 bits - return ((int32_t)1 << 8); - } } // namespace hiptensor #endif // HIPTENSOR_TYPE_TRAITS_HPP diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp similarity index 100% rename from library/src/include/types.hpp rename to library/include/hiptensor/internal/types.hpp diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp similarity index 100% rename from library/src/include/types_ext.hpp rename to library/include/hiptensor/internal/types_ext.hpp diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp index 13dcdffd..ac1d9711 100644 --- a/library/src/contraction/contraction_cpu_reference.cpp +++ b/library/src/contraction/contraction_cpu_reference.cpp @@ -28,31 +28,33 @@ #include "contraction_cpu_reference_impl.hpp" #include "contraction_cpu_reference_instances.hpp" -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ns_ks_lengths, - std::vector const& b_ns_ks_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - 
hipDataType typeC, - hipDataType typeD, - void* workspace) +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace) { - auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto& instances = hiptensor::ContractionCpuReferenceInstances::instance(); + auto computeType = plan->mContractionDesc.mComputeType; auto candidates - = (C == nullptr) - ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD) - : instances->allSolutions().query(typeA, typeB, typeC, typeD); + = (C == nullptr) ? instances->allSolutions().query( + typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType) + : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType); auto toCKVec = [](auto& inputVec) { return std::vector(inputVec.begin(), inputVec.end()); }; diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp index aadb062e..471026dc 100644 --- a/library/src/contraction/contraction_cpu_reference.hpp +++ b/library/src/contraction/contraction_cpu_reference.hpp @@ -32,24 +32,25 @@ #include -hiptensorStatus_t hiptensorContractionReference(void const* alpha, - void const* A, - void const* B, - void const* beta, - void const* C, - void* D, - std::vector const& a_ms_ks_lengths, - std::vector const& a_ms_ks_strides, - std::vector const& b_ks_ns_lengths, - std::vector const& b_ks_ns_strides, - std::vector const& c_ms_ns_lengths, - std::vector const& c_ms_ns_strides, - std::vector const& d_ms_ns_lengths, - std::vector const& d_ms_ns_strides, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - void* workspace); +hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan, + void const* alpha, + void const* A, + void const* B, + void const* beta, + void const* C, + void* D, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + std::vector const& b_ks_ns_lengths, + std::vector const& b_ks_ns_strides, + std::vector const& c_ms_ns_lengths, + std::vector const& c_ms_ns_strides, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + void* workspace); #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 673f6dff..2f031bb0 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -45,19 +45,25 @@ namespace hiptensor { // hardcoded for NumDimM == NumDimN == NumDimK == 2 + // + // ck::bhalf_t is ushort, cannot perform bhalf_t * bhalf_t + // CK does not use ck::bhalf_t as AccDataType. 
But we still + // add this guard here template < ck::index_t NumDimM, ck::index_t NumDimN, ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, - typename AccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, - ck::enable_if_t, bool> = false> struct ReferenceContraction_M2_N2_K2 @@ -70,7 +76,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation> + CDEElementwiseOperation, + ComputeDataType> { using BaseArgument = ck::tensor_operation::device::BaseArgument; using BaseInvoker = ck::tensor_operation::device::BaseInvoker; @@ -149,57 +156,163 @@ namespace hiptensor indices.begin(), indices.end(), strides.begin(), std::size_t{0}); }; - auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - auto accum = static_cast(0); - - auto K0 = arg.mA_ms_ks_lengths[2]; - auto K1 = arg.mA_ms_ks_lengths[3]; - - for(size_t k0 = 0; k0 < K0; k0++) - { - for(size_t k1 = 0; k1 < K1; k1++) + if constexpr((std::is_same_v && + std::is_same_v && + std::is_same_v) || + (std::is_same_v && + std::is_same_v && + std::is_same_v)) + { + auto f_ms_ns_complex = [&](auto m0, auto m1, auto n0, auto n1) { + HIP_vector_type accum{0}; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) + { + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + ADataType valA = ((ADataType*)arg.mA)[indexA]; + BDataType valB = ((BDataType*)arg.mB)[indexB]; + + // Mult / accum + if constexpr(std::is_same_v) + { + accum = hipCaddf(accum, hipCmulf(valA, valB)); + } + else if constexpr(std::is_same_v) + { + accum = hipCadd(accum, hipCmul(valA, valB)); + } + } + } + + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.scale_ * (EDataType)accum; + } + else if constexpr(std::is_same_v) + { + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCmulf(hipComplexDoubleToFloat(arg.mOpCDE.scale_), (EDataType)accum); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCmul(arg.mOpCDE.scale_, (EDataType)accum); + } + } + else if constexpr(std::is_same_v) + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + ((EDataType*)arg.mE)[indexE] = arg.mOpCDE.alpha_ * (EDataType)accum + + arg.mOpCDE.beta_ * ((EDataType*)(arg.mD[0]))[indexD]; + } + else if constexpr(std::is_same_v) + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + + if constexpr(std::is_same_v) + { + ((EDataType*)arg.mE)[indexE] = hipCaddf( + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.alpha_), + (EDataType)accum), + hipCmulf( + hipComplexDoubleToFloat(arg.mOpCDE.beta_), + ((EDataType*)(arg.mD[0]))[indexD])); + } + else + { + ((EDataType*)arg.mE)[indexE] = hipCadd(hipCmul(arg.mOpCDE.alpha_, (EDataType)accum), + hipCmul(arg.mOpCDE.beta_, ((EDataType*)(arg.mD[0]))[indexD])); + } + } + }; + + make_ParallelTensorFunctor(f_ms_ns_complex, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } + else + { + auto f_ms_ns = [&](auto m0, auto 
m1, auto n0, auto n1) { + AccDataType accum = 0; + + auto K0 = arg.mA_ms_ks_lengths[2]; + auto K1 = arg.mA_ms_ks_lengths[3]; + + for(size_t k0 = 0; k0 < K0; k0++) { - auto indexA - = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); - auto indexB - = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - - ADataType valA; - BDataType valB; + for(size_t k1 = 0; k1 < K1; k1++) + { + auto indexA + = offset(std::vector{m0, m1, k0, k1}, arg.mA_ms_ks_strides); + auto indexB + = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); + + AccDataType valA; + AccDataType valB; + + // Element-wise ops + arg.mOpA( + valA, + ck::type_convert(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, + ck::type_convert(((BDataType*)arg.mB)[indexB])); + + // Mult / accum + accum += valA * valB; + } + } - // Element-wise ops - arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]); - arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); + auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - // Mult / accum - accum - += static_cast(valA) * static_cast(valB); + if constexpr(std::is_same_v) + { + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } - } - - auto indexE = offset(std::vector{m0, m1, n0, n1}, arg.mE_ms_ns_strides); - - if constexpr(std::is_same_v) - { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum); - } - else // bilinear - { - // NumDTensor will be 1 due to SFINAE of this class - auto indexD - = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE( - ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]); - } - }; + else // bilinear + { + // NumDTensor will be 1 due to SFINAE of this class + auto indexD + = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); + } + }; - make_ParallelTensorFunctor(f_ms_ns, - arg.mE_ms_ns_lengths[0], - arg.mE_ms_ns_lengths[1], - arg.mE_ms_ns_lengths[2], - arg.mE_ms_ns_lengths[3])( - std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_ms_ns, + arg.mE_ms_ns_lengths[0], + arg.mE_ms_ns_lengths[1], + arg.mE_ms_ns_lengths[2], + arg.mE_ms_ns_lengths[3])( + std::thread::hardware_concurrency()); + } return 0; } @@ -319,23 +432,25 @@ namespace hiptensor ck::index_t NumDimsK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, - typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>> : public MetaTraits< ck::tensor_operation::device::DeviceContractionMultipleD> + CDEElementwiseOperation, + ComputeDataType>> { }; @@ -355,11 +471,13 @@ namespace hiptensor ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> auto enumerateReferenceSolutions() { using ReferenceOp = ReferenceContraction_M2_N2_K2; + CDEElementwiseOperation, + ComputeDataType>; auto solution = std::make_unique>( std::make_unique()); diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 106dd5ff..60c1ce49 100644 --- 
a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -32,6 +32,36 @@ namespace hiptensor ContractionCpuReferenceInstances::ContractionCpuReferenceInstances() { // Register all the solutions exactly once + // Bilinear f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + float, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + float, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateReferenceSolutions<2, @@ -39,11 +69,56 @@ namespace hiptensor 2, float, float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, ck::Tuple, float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipFloatComplex>()); // Bilinear f64 registerSolutions( @@ -52,11 +127,72 @@ namespace hiptensor 2, double, double, + float, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + double, ck::Tuple, double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + double>()); + + // Bilinear complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipDoubleComplex>()); + + // Scale f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + float, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 
2, + ck::bhalf_t, + ck::bhalf_t, + float, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f32 registerSolutions( @@ -65,11 +201,56 @@ namespace hiptensor 2, float, float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + float, + float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale complex f32 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + float, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::ScaleComplex, + hipFloatComplex>()); // Scale f64 registerSolutions( @@ -78,10 +259,41 @@ namespace hiptensor 2, double, double, + float, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + double, + double, + double, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + double>()); + + // Scale complex f64 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + double, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::ScaleComplex, + hipDoubleComplex>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 4fa7acf7..48508c6e 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -34,12 +34,12 @@ #include // hiptensor includes +#include "device/device_element_wise_operation_complex.hpp" #include "data_types.hpp" #include "meta_traits.hpp" namespace hiptensor { - // Partial specialize for Bilinear contraction template + typename BElementwiseOperation, + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>, + std::enable_if_t<(std::is_same_v) || + (std::is_same_v)>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = DsDataType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = 
ck::tensor_operation::element_wise::Bilinear;
+    /*
+     * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) for the CUDA bhalf_t type.
+     * What we want here is that we can use ck::bhalf_t with ck instances and use hip_bfloat16
+     * with hiptensor classes.
+     *
+     * When creating a solution, ck::bhalf_t was passed in to create the ck instance.
+     * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
+     */
+    using ADataT
+        = std::conditional_t, hip_bfloat16, ADataType>;
+    using BDataT
+        = std::conditional_t, hip_bfloat16, BDataType>;
+    using DDataT
+        = std::conditional_t, hip_bfloat16, DsDataType>;
+    using EDataT
+        = std::conditional_t, hip_bfloat16, EDataType>;
+    using ComputeDataT = std::conditional_t,
+                                            hip_bfloat16,
+                                            ComputeDataType>;
+    using AOp   = AElementwiseOperation;
+    using BOp   = BElementwiseOperation;
+    using CDEOp = CDEElementwiseOperation;
     };

     // Partial specialize for Scale contraction
@@ -82,7 +104,9 @@ namespace hiptensor
                  typename BDataType,
                  typename EDataType,
                  typename AElementwiseOperation,
-                 typename BElementwiseOperation>
+                 typename BElementwiseOperation,
+                 typename CDEElementwiseOperation,
+                 typename ComputeDataType>
     struct MetaTraits>
+                                 CDEElementwiseOperation,
+                                 ComputeDataType>,
+                      std::enable_if_t<(std::is_same_v) ||
+                                       (std::is_same_v)>>
     {
         constexpr static ck::index_t DimsM = NumDimsM;
         constexpr static ck::index_t DimsN = NumDimsN;
         constexpr static ck::index_t DimsK = NumDimsK;
-        using ADataT = ADataType;
-        using BDataT = BDataType;
-        using DDataT = NoneType;
-        using EDataT = EDataType;
-        using AOp    = AElementwiseOperation;
-        using BOp    = BElementwiseOperation;
-        using CDEOp  = ck::tensor_operation::element_wise::Scale;
+        using ADataT
+            = std::conditional_t, hip_bfloat16, ADataType>;
+        using BDataT
+            = std::conditional_t, hip_bfloat16, BDataType>;
+        using DDataT = NoneType;
+        using EDataT
+            = std::conditional_t, hip_bfloat16, EDataType>;
+        using ComputeDataT = std::conditional_t,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp   = AElementwiseOperation;
+        using BOp   = BElementwiseOperation;
+        using CDEOp = CDEElementwiseOperation;
     };

 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_pack_util.hpp b/library/src/contraction/contraction_pack_util.hpp
new file mode 100644
index 00000000..5032fa8a
--- /dev/null
+++ b/library/src/contraction/contraction_pack_util.hpp
@@ -0,0 +1,140 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_CONTRACTION_PACK_UTIL_HPP +#define HIPTENSOR_CONTRACTION_PACK_UTIL_HPP + +#include "data_types.hpp" +#include "util.hpp" +#include + +namespace hiptensor +{ + /** + * \brief This function performs multiply-accumulate of the form E = accum * alpha + D * beta + * + */ + template + __global__ void mfma(DataType* mE_real, DataType* mE_imag, DataType* mD_real, DataType* mD_imag, + HIP_vector_type *mE_grid, HIP_vector_type alpha, + HIP_vector_type beta, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCaddf( + hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)), + hipCmulf( + make_hipFloatComplex(mD_real[idx], mD_imag[idx]), + hipComplexDoubleToFloat(beta))); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCadd(hipCmul( + make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), + alpha), + hipCmul( + make_hipDoubleComplex(mD_real[idx], mD_imag[idx]), + beta)); + } + } + } + + /** + * \brief This function performs multiply of the form C = accum * alpha + * + */ + template + __global__ void multiply(DataType* mE_real, DataType* mE_imag, HIP_vector_type *mE_grid, + HIP_vector_type alpha, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmulf( + make_hipFloatComplex(mE_real[idx], mE_imag[idx]), + hipComplexDoubleToFloat(alpha)); + } + else if constexpr(std::is_same_v) + { + mE_grid[idx] = hipCmul( + make_hipDoubleComplex(mE_real[idx], mE_imag[idx]), + alpha); + } + } + } + + /** + * \brief This function unpacks structured data (hipFloatComplex / hipDoubleComplex) + * into non-structured data (float / double). + */ + template + __global__ void unpack(const InputType* in, OutputType* out_real, OutputType *out_img, int length) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if(idx < length) + { + if constexpr(std::is_same_v) + { + out_real[idx] = hipCrealf(in[idx]); + out_img[idx] = hipCimagf(in[idx]); + } + else if constexpr(std::is_same_v) + { + out_real[idx] = hipCreal(in[idx]); + out_img[idx] = hipCimag(in[idx]); + } + } + } + + struct DeviceDeleter + { + void operator()(void* ptr) + { + CHECK_HIP_ERROR(hipFree(ptr)); + } + }; + + template + auto allocDevice(int64_t numElements) + { + T* data; + CHECK_HIP_ERROR(hipMalloc(&data, numElements * sizeof(T))); + return std::unique_ptr(data, DeviceDeleter()); + } + +} // namespace hiptensor + +#endif // HIPTENSOR_CONTRACTION_PACK_UTIL_HPP diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index aaa624f6..f96e8412 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -54,6 +54,7 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { // Make sure that we calculate full element space incase strides are not packed. 
@@ -71,8 +72,27 @@ namespace hiptensor * hipDataTypeSize(typeE); void *A_d, *B_d, *D_d, *E_d, *wspace; - float alpha = 1.02f; - float beta = 1.03f; + + /* + * `alpha` and `beta` are void pointer. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, the `alpha` and `bete` need to point to a ComputeData value + */ + ScalarData alpha; + ScalarData beta; + if(computeType == HIPTENSOR_COMPUTE_C32F || computeType == HIPTENSOR_COMPUTE_C64F) + { + writeVal(&alpha, computeType, {computeType, 1.02, 1.03}); + writeVal(&beta, computeType, {computeType, 1.04, 1.05}); + } + else + { + writeVal(&alpha, computeType, ScalarData(computeType, 1.02)); + writeVal(&beta, computeType, ScalarData(computeType, 1.03)); + } CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA)); CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB)); @@ -151,7 +171,12 @@ namespace hiptensor } template <> - struct ActorCriticSelection + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE, + float> { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -179,329 +204,55 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 43) + unique_id = 11124293857315312720ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d5 <= 61) - { - if(d3 <= 236) - { - if(d4 <= 519) - { - if(d1 <= 744) - { - if(d6 <= 8) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d3 <= 32) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - else - { - if(d6 <= 2) - { - if(d5 <= 15) - { - unique_id = 17618515137355245877ull; - } - else - { - if(d6 <= 1) - { - unique_id = 10830479759059230274ull; - } - else - { - if(d5 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - else - { - if(d5 <= 2) - { - if(d6 <= 8) - { - unique_id = 17618515137355245877ull; - } - else - { - unique_id = 10830479759059230274ull; - } - } - else - { - if(d1 <= 54) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d4 <= 218) - { - if(d5 <= 36) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d2 <= 50) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 31) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d6 <= 32) - { - unique_id = 10830479759059230274ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - } - } - } - } - } - } - } - else - { - if(d6 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d4 <= 557) - { - if(d2 <= 165) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d5 <= 68) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 24) - { - if(d3 <= 435) - { - if(d5 <= 7) - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 744) - { - unique_id = 17304057348073251997ull; - } - else - { - if(d6 <= 60) - { - unique_id = 
4671301146928673150ull; - } - else - { - unique_id = 17304057348073251997ull; - } - } - } - } - else - { - if(d5 <= 1) - { - unique_id = 3454820663416883703ull; - } - else - { - if(d5 <= 13) - { - if(d5 <= 7) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d6 <= 58) - { - unique_id = 4671301146928673150ull; - } - else - { - if(d1 <= 642) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } - else - { - if(d6 <= 54) - { - if(d5 <= 37) - { - if(d4 <= 556) - { - unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - if(d1 <= 222) - { - if(d4 <= 556) - { - unique_id = 16481146763982821264ull; - } - else - { - unique_id = 4671301146928673150ull; - } - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - else - { - if(d4 <= 44) - { - if(d3 <= 436) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d1 <= 220) - { - if(d2 <= 107) - { - unique_id = 17304057348073251997ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - else - { - if(d3 <= 72) - { - unique_id = 16481146763982821264ull; - } - else - { - if(d2 <= 18) - { - unique_id = 4671301146928673150ull; - } - else - { - unique_id = 16481146763982821264ull; - } - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR, + float> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 1953020431947874122ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -516,7 +267,12 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -544,322 +300,55 @@ namespace hiptensor size_t unique_id = 0; - if(d6 <= 9) + unique_id = 14895098881714635802ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 4) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 16) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d2 <= 196) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 113) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d3 <= 219) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d5 <= 8) - { - if(d6 <= 28) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d5 <= 2) - { - if(d6 <= 58) - { - unique_id = 9622108777680582053ull; - } - 
else - { - if(d5 <= 1) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d2 <= 163) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d1 <= 465) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - } - else - { - if(d3 <= 121) - { - if(d4 <= 483) - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 152) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - } - else - { - if(d3 <= 37) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 29) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d4 <= 135) - { - if(d3 <= 413) - { - if(d6 <= 30) - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 222393107113976106ull; - } - } - else - { - if(d5 <= 39) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - } - else - { - if(d4 <= 36) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 120) - { - unique_id = 222393107113976106ull; - } - else - { - if(d6 <= 32) - { - if(d5 <= 32) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } - else - { - if(d2 <= 115) - { - if(d6 <= 40) - { - if(d2 <= 51) - { - unique_id = 222393107113976106ull; - } - else - { - if(d5 <= 32) - { - unique_id = 9622108777680582053ull; - } - else - { - if(d4 <= 486) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d1 <= 235) - { - unique_id = 222393107113976106ull; - } - else - { - if(d2 <= 22) - { - unique_id = 222393107113976106ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - else - { - if(d6 <= 32) - { - if(d5 <= 26) - { - if(d6 <= 23) - { - if(d1 <= 116) - { - unique_id = 9622108777680582053ull; - } - else - { - unique_id = 13257779901106960809ull; - } - } - else - { - if(d5 <= 18) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - if(d5 <= 32) - { - if(d6 <= 16) - { - unique_id = 13257779901106960809ull; - } - else - { - unique_id = 15066925687960442338ull; - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - else - { - unique_id = 15066925687960442338ull; - } - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int 
d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 8517235228581081946ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -874,7 +363,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -893,7 +382,6 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { - int d1 = a_ms_ks_lengths[0]; int d2 = a_ms_ks_lengths[1]; int d3 = b_ns_ks_lengths[0]; @@ -903,238 +391,50 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 36) + unique_id = 17313709378682913599ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d6 <= 35) - { - if(d1 <= 763) - { - if(d6 <= 3) - { - if(d5 <= 8) - { - unique_id = 9769367948782541618ull; - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - unique_id = 3344638327382374968ull; - } - } - else - { - if(d6 <= 24) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 17) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - } - else - { - if(d5 <= 9) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d1 <= 759) - { - if(d6 <= 67) - { - if(d3 <= 535) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d4 <= 615) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - } - else - { - if(d5 <= 25) - { - if(d4 <= 428) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - else - { - if(d6 <= 64) - { - if(d3 <= 65) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 25) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 33) - { - if(d6 <= 8) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d2 <= 565) - { - if(d1 <= 646) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d6 <= 27) - { - unique_id = 3344638327382374968ull; - } - else - { - if(d5 <= 53) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - else - { - if(d6 <= 20) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d5 <= 64) - { - if(d1 <= 648) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - if(d6 <= 25) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - } - } - } - } - else - { - if(d5 <= 45) - { - if(d6 <= 50) - { - if(d3 <= 168) - { - unique_id = 3344638327382374968ull; - } - else - { - unique_id = 2770278462698889442ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - if(d6 <= 43) - { - if(d5 <= 52) - { - unique_id = 2770278462698889442ull; - } - else - { - unique_id = 16588612317409292216ull; - } - } - else - { - unique_id = 16588612317409292216ull; - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + 
static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 14397647188602189900ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1149,7 +449,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1177,217 +477,55 @@ namespace hiptensor size_t unique_id = 0; - if(d5 <= 39) + unique_id = 8339198051871565944ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { - if(d3 <= 937) - { - if(d6 <= 1) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d4 <= 754) - { - if(d5 <= 33) - { - if(d5 <= 1) - { - if(d6 <= 25) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d6 <= 6) - { - if(d5 <= 8) - { - unique_id = 3423207643344265161ull; - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - } - else - { - unique_id = 1830537384143755749ull; - } - } - else - { - if(d1 <= 404) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 50) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - } - else - { - unique_id = 1830537384143755749ull; - } + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; } else { - if(d6 <= 32) - { - if(d2 <= 832) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 8) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d6 <= 24) - { - unique_id = 17689908062647780665ull; - } - else - { - if(d5 <= 64) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - } - else - { - if(d6 <= 46) - { - if(d5 <= 54) - { - if(d1 <= 460) - { - unique_id = 1830537384143755749ull; - } - else - { - if(d5 <= 49) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - else - { - if(d1 <= 182) - { - if(d5 <= 65) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 33) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - } - } - else - { - if(d5 <= 49) - { - if(d6 <= 64) - { - if(d1 <= 411) - { - if(d2 <= 396) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - unique_id = 4992687403741300893ull; - } - } - else - { - if(d2 <= 53) - { - if(d1 <= 222) - { - unique_id = 1830537384143755749ull; - } - else - { - unique_id = 4992687403741300893ull; - } - } - 
else - { - unique_id = 4992687403741300893ull; - } - } - } - } + return HIPTENSOR_STATUS_EXECUTION_FAILED; } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 2724417728984064737ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1401,8 +539,11 @@ namespace hiptensor } }; - hiptensorStatus_t - actorCriticModel(ContractionSolution** winner, + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, std::unordered_map const& candidates, hipDataType typeA, std::vector const& a_ms_ks_lengths, @@ -1417,88 +558,889 @@ namespace hiptensor std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) - { - if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F) - { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); - } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F - && typeE == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 5943247903036531691ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE - && typeE == HIP_R_64F) + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - 
b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 17972447156160297755ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F - && typeE == HIP_R_64F) - { - return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 3893144338697524749ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + unique_id = 15165261158317928321ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector 
const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 14511729289005214097ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 3636246152928348445ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 5711776907278244209ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t 
unique_id = 0; + + unique_id = 355777364055884033ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 3085227716611397774ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + unique_id = 2196983681630807584ull; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + hiptensorStatus_t + actorCriticModel(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, + const uint64_t workspaceSize) + { + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && 
typeE == HIP_R_16F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE + && typeE == HIP_R_16BF && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF + && typeE == HIP_R_16BF && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + 
typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIPTENSOR_COMPUTE_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_32F && typeB == HIP_C_32F && typeD == NONE_TYPE && typeE == HIP_C_32F + && computeType == HIPTENSOR_COMPUTE_C32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_32F && typeB == HIP_C_32F && typeD == HIP_C_32F && typeE == HIP_C_32F + && computeType == HIPTENSOR_COMPUTE_C32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == NONE_TYPE && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_C_64F && typeB == HIP_C_64F && typeD == HIP_C_64F && typeE == HIP_C_64F + && computeType == HIPTENSOR_COMPUTE_C64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } 
return HIPTENSOR_STATUS_EXECUTION_FAILED; } diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp index 9ceb6a14..deb980d9 100644 --- a/library/src/contraction/contraction_selection.hpp +++ b/library/src/contraction/contraction_selection.hpp @@ -49,9 +49,15 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); - template + template struct ActorCriticSelection { static hiptensorStatus_t @@ -87,6 +93,7 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index 0037584e..97dde1ca 100644 --- a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -38,6 +38,8 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" + #include "contraction_meta_traits.hpp" #include "contraction_solution_params.hpp" #include "performance.hpp" @@ -147,7 +149,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> std::vector> enumerateContractionSolutions(); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 0fb5df9d..09e300a7 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -35,11 +35,11 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolution const& s) const noexcept + size_t operator()(hiptensor::ContractionSolution const& s) const noexcept { - return std::hash{}(*s.params()); + return hash{}(*s.params()); } }; } @@ -52,8 +52,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Bilinear>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Bilinear>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::BilinearComplex>)>> : public ContractionSolution { public: @@ -90,16 +92,18 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; - auto betaF = 0.0f; + ScalarData alphaF; + ScalarData betaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } if(beta != nullptr) { - betaF = hiptensor::readVal(beta, HipDataType_v); + betaF = hiptensor::readVal( + beta, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... 
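Since the hunk above now reads `alpha`/`beta` back with `readVal` at the type reported by `convertToComputeType`, callers have to stage both scalars as `ScalarData` values rather than plain `float`. A minimal caller-side sketch, mirroring the test change at the top of this patch (`ScalarData`, `writeVal`, and the `HIPTENSOR_COMPUTE_*` enumerators come from this patch; the `computeType` variable and the `1.0`/`0.0` literals are illustrative assumptions):

```cpp
// Illustrative sketch only: stage alpha/beta at the contraction's compute type
// so that readVal(..., convertToComputeType(...)) recovers the intended values.
ScalarData alpha;
ScalarData beta;
if(computeType == HIPTENSOR_COMPUTE_C32F || computeType == HIPTENSOR_COMPUTE_C64F)
{
    // Complex compute types are initialized from a {real, imaginary} pair.
    writeVal(&alpha, computeType, {computeType, 1.0, 0.0});
    writeVal(&beta, computeType, {computeType, 1.0, 0.0});
}
else
{
    writeVal(&alpha, computeType, ScalarData(computeType, 1.0));
    writeVal(&beta, computeType, ScalarData(computeType, 1.0));
}
// &alpha and &beta are then passed as the void* scalar arguments, e.g. to
// hiptensorContraction(...), which forwards them to the solver shown above.
```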
@@ -123,7 +127,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF, betaF})); + typename Traits::CDEOp(alphaF, betaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); @@ -163,8 +167,10 @@ namespace hiptensor template class ContractionSolutionImpl< DeviceOp, - std::enable_if_t::CDEOp, - ck::tensor_operation::element_wise::Scale>>> + std::enable_if_t<(std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::Scale>) + || (std::is_same_v::CDEOp, + ck::tensor_operation::element_wise::ScaleComplex>)>> : public ContractionSolution { public: @@ -201,11 +207,12 @@ namespace hiptensor auto* deviceOp = dynamic_cast(Base::mDeviceOp.get()); // Note: CK ALWAYS uses float for alpha / beta in contraction multipleD - auto alphaF = 0.0f; + ScalarData alphaF; if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... @@ -229,7 +236,7 @@ namespace hiptensor toCKVec(e_ms_ns_strides), typename Traits::AOp{}, typename Traits::BOp{}, - typename Traits::CDEOp{alphaF})); + typename Traits::CDEOp(alphaF))); // Attach the workspace pointer deviceOp->SetWorkSpacePointer(Base::mArgPtr.get(), workspacePtr); @@ -274,7 +281,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> std::vector> enumerateContractionSolutions() { using ContractionOp @@ -287,7 +295,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation>; + CDEElementwiseOperation, + ComputeDataType>; using Factory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index fd263a8b..ad5b4408 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -27,11 +27,44 @@ #include "contraction_solution_instances.hpp" #include "contraction_solution.hpp" +// Ensure access to +#include "device/hiptensor_contraction_bilinear_instances.hpp" +#include "device/hiptensor_contraction_scale_instances.hpp" + namespace hiptensor { ContractionSolutionInstances::ContractionSolutionInstances() { // Register all the solutions exactly once + + // Bilinear bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateContractionSolutions<2, @@ -43,7 +76,48 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + 
registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); + + // Bilinear complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipFloatComplex>()); // Bilinear f64 registerSolutions( @@ -56,7 +130,62 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); + + // Bilinear complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::BilinearComplex, + hipDoubleComplex>()); + + // Scale bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); // Scale f32 registerSolutions( @@ -69,7 +198,48 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); + + // scale complex f32 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipFloatComplex, + hipFloatComplex, + ck::Tuple<>, + hipFloatComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + 
ck::tensor_operation::element_wise::ScaleComplex, + hipFloatComplex>()); // Scale f64 registerSolutions( @@ -82,6 +252,34 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); + // scale complex f64 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + hipDoubleComplex, + hipDoubleComplex, + ck::Tuple<>, + hipDoubleComplex, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::ScaleComplex, + hipDoubleComplex>()); + } } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp index ec9de45c..4c44de88 100644 --- a/library/src/contraction/contraction_solution_params.hpp +++ b/library/src/contraction/contraction_solution_params.hpp @@ -49,10 +49,11 @@ namespace hiptensor virtual int32_t dimsK() const = 0; // Map to hipDataType - virtual hipDataType typeA() const = 0; - virtual hipDataType typeB() const = 0; - virtual hipDataType typeC() const = 0; - virtual hipDataType typeD() const = 0; + virtual hipDataType typeA() const = 0; + virtual hipDataType typeB() const = 0; + virtual hipDataType typeC() const = 0; + virtual hipDataType typeD() const = 0; + virtual hiptensorComputeType_t typeCompute() const = 0; // Map to operators virtual hiptensorOperator_t opA() const = 0; diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index bff33960..3abcaede 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -35,13 +35,14 @@ namespace std { template <> - struct std::hash + struct hash { - std::size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept + size_t operator()(hiptensor::ContractionSolutionParams const& s) const noexcept { return hiptensor::Hash{}(s.dimsM(), s.dimsN(), s.dimsK(), + s.typeCompute(), s.typeA(), s.typeB(), s.typeC(), @@ -102,6 +103,11 @@ namespace hiptensor return HipDataType_v; } + hiptensorComputeType_t typeCompute() const override + { + return convertToComputeType(HipDataType_v); + } + hiptensorOperator_t opA() const override { return ElementWiseOperatorType_v; diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp index 83674c81..9e2da1f9 100644 --- a/library/src/contraction/contraction_solution_registry.cpp +++ b/library/src/contraction/contraction_solution_registry.cpp @@ -53,19 +53,20 @@ namespace hiptensor } ContractionSolutionRegistry::Query - ContractionSolutionRegistry::Query::query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const + ContractionSolutionRegistry::Query::query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType 
typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const { - auto solutionHash - = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + auto solutionHash = hashSolution( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end()) { @@ -81,10 +82,14 @@ namespace hiptensor return query(hashDimsMNK(dimsM, dimsN, dimsK)); } - ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const + ContractionSolutionRegistry::Query + ContractionSolutionRegistry::Query::query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const { - return query(hashTypesABCD(typeA, typeB, typeC, typeD)); + return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute)); } ContractionSolutionRegistry::Query @@ -159,18 +164,20 @@ namespace hiptensor /* static */ ContractionSolutionRegistry::Query::HashId - ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) + ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) { - return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + return Hash{}( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); } /* static */ @@ -181,10 +188,14 @@ namespace hiptensor } /* static */ - ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) + ContractionSolutionRegistry::Query::HashId + ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) { - return Hash{}(typeA, typeB, typeC, typeD); + return Hash{}(typeA, typeB, typeC, typeD, typeCompute); } /* static */ @@ -220,12 +231,16 @@ namespace hiptensor params->typeD(), params->opA(), params->opB(), - params->opCDE()); + params->opCDE(), + params->typeCompute()); auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK()); - auto typesABCDHash - = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD()); + auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(), + params->typeB(), + params->typeC(), + params->typeD(), + params->typeCompute()); auto elementOpsHash = hashElementOps(params->opA(), params->opB()); @@ -236,7 +251,7 @@ namespace hiptensor mAllSolutions[solutionUid] = solution; mSolutionHash[solutionHash].push_back(solution); mSolutionHash[dimsMNKHash].push_back(solution); - mSolutionHash[typesABCDHash].push_back(solution); + mSolutionHash[typesComputeABCDHash].push_back(solution); mSolutionHash[elementOpsHash].push_back(solution); mSolutionHash[contactionOpsHash].push_back(solution); } diff --git a/library/src/contraction/contraction_solution_registry.hpp 
b/library/src/contraction/contraction_solution_registry.hpp index d1b80ec5..44aaa97d 100644 --- a/library/src/contraction/contraction_solution_registry.hpp +++ b/library/src/contraction/contraction_solution_registry.hpp @@ -59,25 +59,27 @@ namespace hiptensor /// E.g. in this context, query further parameters. // By full solution type - Query query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const; + Query query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const; // By dimensions Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const; // By data types - Query query(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD) const; + Query query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const; // By element-wise operations Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const; @@ -104,22 +106,24 @@ namespace hiptensor Query query(HashId queryHash) const; // Hashing helpers - static HashId hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE); + static HashId hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute); static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK); - static HashId hashTypesABCD(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD); + static HashId hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute); static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB); static HashId hashContractionOps(ContractionOpId_t opCDE); diff --git a/library/src/contraction/contraction_types.hpp b/library/src/contraction/contraction_types.hpp index 101d72dc..e4930726 100644 --- a/library/src/contraction/contraction_types.hpp +++ b/library/src/contraction/contraction_types.hpp @@ -40,6 +40,8 @@ namespace hiptensor { SCALE = 0, ///< \f${C=\alpha\mathcal{A}\mathcal{B}}\f$ BILINEAR = 1, ///< \f${D=\alpha\mathcal{A}\mathcal{B}+\beta\mathcal{C}}\f$ + SCALE_COMPLEX = 2, + BILINEAR_COMPLEX = 3, UNKNOWN, }; diff --git a/library/src/contraction/contraction_types_impl.hpp b/library/src/contraction/contraction_types_impl.hpp index d8fa0f74..070718cc 100644 --- a/library/src/contraction/contraction_types_impl.hpp +++ b/library/src/contraction/contraction_types_impl.hpp @@ -32,6 +32,7 @@ #include #include +#include "device/device_element_wise_operation_complex.hpp" #include "contraction_types.hpp" #include @@ -51,12 +52,24 @@ namespace hiptensor static constexpr auto value = ContractionOpId_t::SCALE; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::SCALE_COMPLEX; + }; + template <> struct ContractionOperatorType { static constexpr auto 
value = ContractionOpId_t::BILINEAR; }; + template <> + struct ContractionOperatorType + { + static constexpr auto value = ContractionOpId_t::BILINEAR_COMPLEX; + }; + } // namespace hiptensor #endif // HIPTENSOR_CONTRACTION_TYPES_IMPL_HPP diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index f2e4a0fb..b65a8ab1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -24,24 +24,80 @@ # ############################################################################### -set(CK_CONTRACTION_INSTANCE_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp -) + set(CK_CONTRACTION_INSTANCE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/common.hpp b/library/src/contraction/device/common.hpp index f530b2e2..efd4866c 100644 --- a/library/src/contraction/device/common.hpp +++ b/library/src/contraction/device/common.hpp @@ -39,4 +39,6 @@ #include #include +#include "device_element_wise_operation_complex.hpp" + #endif // CONTRACTION_DEVICE_COMMON_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_complex.hpp b/library/src/contraction/device/device_contraction_bilinear_complex.hpp new file mode 100644 index 00000000..307ecb1c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_complex.hpp @@ -0,0 +1,718 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP
+#define HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP
+
+#include "../contraction_pack_util.hpp"
+#include "common.hpp"
+#include
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+
+            using hiptensor::allocDevice;
+            using hiptensor::ceilDiv;
+            using hiptensor::DeviceDeleter;
+            using hiptensor::elementSpaceFromLengthsAndStrides;
+
+            using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+            using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex;
+            using Scale = ck::tensor_operation::element_wise::Scale;
+            using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex;
+
+            // The following is a specialization class for bilinear contractions of complex types.
+            // For complex types, the contraction can be decomposed into 4 simple contractions
+            // (2 scale and 2 bilinear) on the underlying real element type.
+            // The class implements a CK interface to wrap the 4 individual contraction operations
+            // and argument handling internally.
+            // Note: We are assuming that the data comes in as an Array of Structures (AOS) format
+            // in complex pairs. The argument initialization portion decomposes this data into a
+            // Structure of Arrays (SOA), where the real and imaginary elements can be operated on
+            // separately.
+
+            // Tensor Contraction:
+            //   input : A
+            //   input : B
+            //   input : D0, D1, ...
+            //   output : E
+            //   C = a_op(A) * b_op(B)
+            //   E = cde_op(C, D0, D1, ...)
+            // Assume:
+            //   A[M0, M1, M2, ..., K0, K1, K2, ...]
+            //   B[N0, N1, N2, ..., K0, K1, K2, ...]
+            //   D[M0, M1, M2, ..., N0, N1, N2, ...]
+            //   E[M0, M1, M2, ..., N0, N1, N2, ...]
+            template
+            struct DeviceContractionMultipleD_Xdl_CShuffle<
+                NumDimM,
+                NumDimN,
+                NumDimK,
+                HIP_vector_type,
+                HIP_vector_type,
+                AccDataType,
+                CShuffleDataType,
+                ck::Tuple>,
+                HIP_vector_type,
+                AElementwiseOperation,
+                BElementwiseOperation,
+                BilinearComplex,
+                GemmSpec,
+                NumGemmKPrefetchStage,
+                BlockSize,
+                MPerBlock,
+                NPerBlock,
+                KPerBlock,
+                AK1,
+                BK1,
+                MPerXDL,
+                NPerXDL,
+                MXdlPerWave,
+                NXdlPerWave,
+                ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                ABlockTransferThreadClusterArrangeOrder,
+                ABlockTransferSrcAccessOrder,
+                ABlockTransferSrcVectorDim,
+                ABlockTransferSrcScalarPerVector,
+                ABlockTransferDstScalarPerVector_AK1,
+                ABlockLdsExtraM,
+                BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                BBlockTransferThreadClusterArrangeOrder,
+                BBlockTransferSrcAccessOrder,
+                BBlockTransferSrcVectorDim,
+                BBlockTransferSrcScalarPerVector,
+                BBlockTransferDstScalarPerVector_BK1,
+                BBlockLdsExtraN,
+                CShuffleMXdlPerWavePerShuffle,
+                CShuffleNXdlPerWavePerShuffle,
+                CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                CDEBlockTransferScalarPerVector_NPerBlock,
+                HIP_vector_type,
+                LoopSched>
+
+                : public DeviceContractionMultipleD,
+                         HIP_vector_type,
+                         ck::Tuple>,
+                         HIP_vector_type,
+                         AElementwiseOperation,
+                         BElementwiseOperation,
+                         BilinearComplex,
+                         HIP_vector_type>
+            {
+                // Complex device Op
+                using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle;
+
+                // CDE Operations
+                using ScaleCDEElementwiseOperation = ScaleComplex;
+                using DecompScaleCDEElementwiseOperation = Scale;
+                using BilinearCDEElementwiseOperation = BilinearComplex;
+                using DecompBilinearCDEElementwiseOperation = Bilinear;
+
+                // Complex types given through the interface
+                using ComplexA = HIP_vector_type;
+                using ComplexB = HIP_vector_type;
+                using ComplexDs = HIP_vector_type;
+                using ComplexE = HIP_vector_type;
+                using ComplexCompute = HIP_vector_type;
+
+                // Internal functional types we will use to
+                // decompose the complex types and operate on.
+                using DecompA = ADataType;
+                using DecompB = BDataType;
+                using DecompDs = DsDataType;
+                using DecompE = EDataType;
+                using DecompCompute = ComputeDataType;
+
+                // For complex types, we need to make sure that all of the types are the same
+                static_assert(std::is_same_v && std::is_same_v
+                                  && std::is_same_v
+                                  && std::is_same_v
+                                  && std::is_same_v,
+                              "Complex operations must have the same data type");
+
+                static_assert(std::is_same_v || std::is_same_v,
+                              "Complex operations only supported with single or double precision");
+
+                static constexpr index_t NumDTensor = 1;
+
+                // The internal operation that we will decompose the complex operations with.
+                // For complex types, this will be either float or double.
+                using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle<
+                    NumDimM,
+                    NumDimN,
+                    NumDimK,
+                    DecompA,
+                    DecompB,
+                    AccDataType,
+                    CShuffleDataType,
+                    ck::Tuple<>,
+                    DecompE,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    DecompScaleCDEElementwiseOperation,
+                    GemmSpec,
+                    NumGemmKPrefetchStage,
+                    BlockSize,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    AK1,
+                    BK1,
+                    MPerXDL,
+                    NPerXDL,
+                    MXdlPerWave,
+                    NXdlPerWave,
+                    ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                    ABlockTransferThreadClusterArrangeOrder,
+                    ABlockTransferSrcAccessOrder,
+                    ABlockTransferSrcVectorDim,
+                    ABlockTransferSrcScalarPerVector,
+                    ABlockTransferDstScalarPerVector_AK1,
+                    ABlockLdsExtraM,
+                    BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                    BBlockTransferThreadClusterArrangeOrder,
+                    BBlockTransferSrcAccessOrder,
+                    BBlockTransferSrcVectorDim,
+                    BBlockTransferSrcScalarPerVector,
+                    BBlockTransferDstScalarPerVector_BK1,
+                    BBlockLdsExtraN,
+                    CShuffleMXdlPerWavePerShuffle,
+                    CShuffleNXdlPerWavePerShuffle,
+                    CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                    CDEBlockTransferScalarPerVector_NPerBlock,
+                    DecompCompute,
+                    LoopSched>;
+
+                // The internal operation that we will decompose the complex operations with.
+                // For complex types, this will be either float or double.
+                using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle<
+                    NumDimM,
+                    NumDimN,
+                    NumDimK,
+                    DecompA,
+                    DecompB,
+                    AccDataType,
+                    CShuffleDataType,
+                    ck::Tuple,
+                    DecompE,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    DecompBilinearCDEElementwiseOperation,
+                    GemmSpec,
+                    NumGemmKPrefetchStage,
+                    BlockSize,
+                    MPerBlock,
+                    NPerBlock,
+                    KPerBlock,
+                    AK1,
+                    BK1,
+                    MPerXDL,
+                    NPerXDL,
+                    MXdlPerWave,
+                    NXdlPerWave,
+                    ABlockTransferThreadClusterLengths_AK0_M_AK1,
+                    ABlockTransferThreadClusterArrangeOrder,
+                    ABlockTransferSrcAccessOrder,
+                    ABlockTransferSrcVectorDim,
+                    ABlockTransferSrcScalarPerVector,
+                    ABlockTransferDstScalarPerVector_AK1,
+                    ABlockLdsExtraM,
+                    BBlockTransferThreadClusterLengths_BK0_N_BK1,
+                    BBlockTransferThreadClusterArrangeOrder,
+                    BBlockTransferSrcAccessOrder,
+                    BBlockTransferSrcVectorDim,
+                    BBlockTransferSrcScalarPerVector,
+                    BBlockTransferDstScalarPerVector_BK1,
+                    BBlockLdsExtraN,
+                    CShuffleMXdlPerWavePerShuffle,
+                    CShuffleNXdlPerWavePerShuffle,
+                    CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                    CDEBlockTransferScalarPerVector_NPerBlock,
+                    DecompCompute,
+                    LoopSched>;
+
+                // Argument
+                struct Argument : public BaseArgument
+                {
+                    using ScaleDecompArgument = typename ScaleDecompOp::Argument;
+                    using BilinearDecompArgument = typename BilinearDecompOp::Argument;
+
+                    Argument(Argument&& other)
+                        : mScaleArgs(
+                            {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])})
+                        , mBilinearArgs({std::move(other.mBilinearArgs[0]),
+                                         std::move(other.mBilinearArgs[1])})
+                    {
+                    }
+
+                    Argument& operator=(Argument&& other)
+                    {
+                        if(this != &other)
+                        {
+                            mScaleArgs[0] = std::move(other.mScaleArgs[0]);
+                            mScaleArgs[1] = std::move(other.mScaleArgs[1]);
+                            mBilinearArgs[0] = std::move(other.mBilinearArgs[0]);
+                            mBilinearArgs[1] = std::move(other.mBilinearArgs[1]);
+                        }
+                        return *this;
+                    }
+
+                    Argument(const void* p_a_grid,
+                             const void* p_b_grid,
+                             std::array p_ds_grid,
+                             void* p_e_grid,
+                             const std::vector& a_ms_ks_lengths,
+                             const std::vector& a_ms_ks_strides,
+                             const std::vector& b_ns_ks_lengths,
+                             const std::vector& b_ns_ks_strides,
+                             const std::array, NumDTensor>& ds_ms_ns_lengths,
+                             const std::array, NumDTensor>& ds_ms_ns_strides,
+                             const std::vector& e_ms_ns_lengths,
+                             const std::vector& e_ms_ns_strides,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             BilinearCDEElementwiseOperation cde_element_op)
+                        : element_op(cde_element_op)
+                    {
+                        // Take the incoming arguments and treat them as complex.
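+                        // The four decomposed passes set up below compute, per element:
+                        //   E_real = A_real * B_real - A_imag * B_imag   (one scale + one bilinear pass)
+                        //   E_imag = A_real * B_imag + A_imag * B_real   (one scale + one bilinear pass)
+                        // The complex epilogue E = alpha * (A * B) + beta * D is applied afterwards
+                        // in the invoker, once both real and imaginary parts are available.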
+ + // Allocate Real and Imaginary inputs + auto elementsA + = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides); + auto elementsB + = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); + auto elementsD = elementSpaceFromLengthsAndStrides(ds_ms_ns_lengths[0], + ds_ms_ns_strides[0]); + elementsE + = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + + mA_real.reset(nullptr); + mA_imag.reset(nullptr); + mB_real.reset(nullptr); + mB_imag.reset(nullptr); + mD_real.reset(nullptr); + mD_imag.reset(nullptr); + mE_real.reset(nullptr); + mE_imag.reset(nullptr); + + mE_grid = p_e_grid; + auto blockDim = dim3(1024); + + auto decompGrid = [blockDim](auto& out_r, + auto& out_i, + auto const* input_grid, + uint32_t elementCount) { + using DecompT = typename std::decay_t::element_type; + static_assert(std::is_same_v< + DecompT, + typename std::decay_t::element_type>, + "r and i buffers must be same type"); + + if(input_grid != nullptr) + { + out_r = std::move(allocDevice(elementCount)); + out_i = std::move(allocDevice(elementCount)); + + auto gridDim = dim3(ceilDiv(elementCount, blockDim.x)); + hiptensor::unpack<<>>( + input_grid, out_r.get(), out_i.get(), elementCount); + } + }; + + // Decompose the incoming data from AOS->SOA + decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mD_real, mD_imag, (const ComplexDs*)p_ds_grid[0], elementsD); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); + + auto allocScaleArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + auto allocBilinearArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& in_d, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{in_d.get()}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{e_ms_ns_lengths}, + std::array, 1>{e_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); + } + + void Print() const + { + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + mBilinearArgs[1]->Print(); + } 
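+
+                    // What follows is the per-decomposition state: two scale and two
+                    // bilinear argument sets (one pair each for the real and imaginary
+                    // outputs), plus device buffers that keep the unpacked SOA copies of
+                    // A, B, D and E alive for the lifetime of this Argument.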
+ + // private: + // Each argument set for complex: + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; + + template + using DeviceArray = std::unique_ptr; + + // Manage extra memory for AOS->SOA + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mD_real; + DeviceArray mD_imag; + DeviceArray mE_real; + DeviceArray mE_imag; + + BilinearCDEElementwiseOperation element_op; + void* mE_grid; + index_t elementsE; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = typename DeviceOp::Argument; + + Invoker() + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) + { + } + + Invoker(Invoker&& other) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , mBilinearInvoker(std::move(other.mBilinearInvoker)) + { + } + + Invoker& operator=(Invoker&& other) + { + if(this != &other) + { + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); + } + return *this; + } + + float Run(const Argument& arg, + const StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); + + if(arg.mE_grid != nullptr) + { + auto blockDim = dim3(1024); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + hiptensor::mfma<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + arg.mD_real.get(), + arg.mD_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.alpha_, + arg.element_op.beta_, + arg.elementsE); + } + + return r0 + r1 + r2 + r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. 
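+                    // Note: the same workspace pointer is shared by the base argument and
+                    // all four decomposed argument sets below.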
+ this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + BilinearCDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + BilinearCDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_BILINEAR_COMPLEX_HPP diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..3b3f6d47 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * 
Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..fd43f0ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..21fb8127 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..cc975c03 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp new file mode 100644 index 00000000..4601021e --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance.cpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp new file mode 100644 index 00000000..e3f60146 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp new file mode 100644 index 00000000..c2fd7c84 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp new file mode 100644 index 00000000..8203a4e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using CF32_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp new file mode 100644 index 00000000..9d779671 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance.cpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp new file mode 100644 index 00000000..4197dda2 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp new file mode 100644 index 00000000..cc519368 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp new file mode 100644 index 00000000..ff187398 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_bilinear_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + using BilinearComplex = element_wise::BilinearComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ff670630 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..be8bfe84 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
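
The kknn/knnn/mknn/mnnn suffixes in these file names encode which mode of each of A/B/D/E is the fast-changing (unit-stride) one, as the per-file comments note. A minimal sketch of what the first letter means for A[m0, m1, k0, k1], with hypothetical helper names that are not part of the patch:

    #include <array>
    #include <cstddef>

    // "k"-fast layout (the first "k" in kknn): k1 is the unit-stride mode of A.
    // Mode order slowest -> fastest is m0, m1, k0, k1.
    std::array<std::size_t, 4> stridesKFast(std::size_t M0, std::size_t M1, std::size_t K0, std::size_t K1)
    {
        (void)M0; // the outermost extent does not affect any stride
        return {M1 * K0 * K1, K0 * K1, K1, 1}; // strides of (m0, m1, k0, k1)
    }

    // "m"-fast layout (the "m" in mknn): m1 is the unit-stride mode of A.
    // Mode order slowest -> fastest is k0, k1, m0, m1.
    std::array<std::size_t, 4> stridesMFast(std::size_t M0, std::size_t M1, std::size_t K0, std::size_t K1)
    {
        (void)K0; // the outermost extent does not affect any stride
        return {M1, 1, K1 * M0 * M1, M0 * M1}; // strides of (m0, m1, k0, k1)
    }
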
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..4be69898 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..2f6d630b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
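
CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK, defined at the top of every file here, opts into out-of-bounds-checked buffer loads. Conceptually, such a load returns zero for offsets past the end of the buffer instead of reading stray memory; a scalar model under that assumption (hypothetical helper, the real mechanism lives in CK's buffer-addressing code):

    #include <cstddef>

    // Conceptual model of an out-of-bounds-checked buffer load: AMD buffer
    // descriptors can return 0 for addresses outside their range, which the
    // OOB-check trick relies on.
    template <typename T>
    T oobCheckedLoad(const T* data, std::size_t elementCount, std::size_t offset)
    {
        return offset < elementCount ? data[offset] : T{0};
    }
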
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
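
All of these files register instances of the same bilinear contraction, E[m0, m1, n0, n1] = alpha * A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + beta * D[m0, m1, n0, n1]. A host-side reference of those semantics with the paired modes flattened to single M/N/K extents (a minimal sketch, assuming row-major float buffers; not the library's kernel):

    #include <cstddef>
    #include <vector>

    // Reference semantics of the bilinear contraction (illustration only):
    // E[m, n] = alpha * sum_k A[m, k] * B[n, k] + beta * D[m, n],
    // where m = (m0, m1), n = (n0, n1), k = (k0, k1) are flattened.
    void bilinearContractionRef(const std::vector<float>& A,
                                const std::vector<float>& B,
                                const std::vector<float>& D,
                                std::vector<float>&       E,
                                std::size_t M, std::size_t N, std::size_t K,
                                float alpha, float beta)
    {
        for(std::size_t m = 0; m < M; ++m)
        {
            for(std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                {
                    acc += A[m * K + k] * B[n * K + k];
                }
                E[m * N + n] = alpha * acc + beta * D[m * N + n];
            }
        }
    }
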
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
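
Each add_device_contraction_* entry point follows the same shape: the using-alias names a tuple of kernel configurations, and add_device_operation_instances appends one default-constructed object per tuple element to the caller's vector. A simplified model of that mechanism (assumed shape only; CK's real template differs in detail):

    #include <memory>
    #include <tuple>
    #include <vector>

    // Simplified model of CK's add_device_operation_instances: walk an op
    // tuple and register one instance of every configuration. Assumes each
    // Op in the tuple derives from BaseOp.
    template <typename BaseOp, typename... Ops>
    void addDeviceOperationInstances(std::vector<std::unique_ptr<BaseOp>>& instances,
                                     std::tuple<Ops...>)
    {
        (instances.push_back(std::make_unique<Ops>()), ...);
    }
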
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp new file mode 100644 index 00000000..d82ea442 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
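
The compute_f16 and compute_bf16 variants keep f32 storage but perform the multiply in the narrower compute type. A scalar model of one such step (illustration only; assumes compiler support for _Float16):

    // Scalar model of the f32-storage / f16-compute variants: operands are
    // stored as float, converted to the compute type for the multiply, and
    // accumulated back in float.
    float mixedPrecisionFma(float a, float b, float acc)
    {
        _Float16 ah = static_cast<_Float16>(a); // down-convert storage -> compute
        _Float16 bh = static_cast<_Float16>(b);
        return acc + static_cast<float>(ah) * static_cast<float>(bh);
    }
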
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp new file mode 100644 index 00000000..772df2e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp new file mode 100644 index 00000000..8b1d0681 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
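
On the consumer side, a vector is filled by these add_* functions and a usable instance is chosen at runtime. A hedged sketch with a hypothetical interface (the real DeviceContractionMultipleD API exposes IsSupportedArgument and argument/invoker factories with far more parameters):

    #include <memory>
    #include <vector>

    // Hypothetical op interface used only for this sketch.
    struct OpSketch
    {
        virtual ~OpSketch()              = default;
        virtual bool isSupported() const = 0;
        virtual void run() const         = 0;
    };

    // Pick the first registered configuration that supports the problem.
    bool runFirstSupported(const std::vector<std::unique_ptr<OpSketch>>& instances)
    {
        for(const auto& op : instances)
        {
            if(op->isSupported())
            {
                op->run();
                return true;
            }
        }
        return false; // no registered configuration can handle the problem
    }
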
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, 
F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { 
add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| 
_NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 
2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
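// Every bilinear instance in this patch realizes the contraction stated in the
// per-file comments: E[m0,m1,n0,n1] = alpha * sum_{k0,k1} A[m0,m1,k0,k1] * B[n0,n1,k0,k1]
//                                     + beta * D[m0,m1,n0,n1],
// with alpha and beta supplied through the Bilinear element-wise operation. A
// minimal scalar reference sketch, assuming dense row-major buffers; the
// function name and extent parameters are illustrative, not part of the library:
void reference_bilinear_contraction(const float* A, const float* B, const float* D, float* E,
                                    int M0, int M1, int N0, int N1, int K0, int K1,
                                    float alpha, float beta)
{
    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
                for(int n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f;
                    for(int k0 = 0; k0 < K0; ++k0)
                        for(int k1 = 0; k1 < K1; ++k1)
                            acc += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1]
                                   * B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
                    const int e = ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
                    E[e] = alpha * acc + beta * D[e];
                }
}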
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
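// Each hunk above swaps a hand-maintained std::tuple of
// DeviceContractionMultipleD_Xdl_CShuffle configurations for a shared alias
// (device_contraction_kk_instance, _kn_, _mk_, _mn_) coming from the new
// device_contraction_instance.hpp header, so one tuning table serves every
// data-type/layout file. A self-contained toy of that pattern; all names
// here are hypothetical stand-ins, not the real CK templates:
#include <tuple>

template <typename DataT, int BlockSize, int MPerBlock, int NPerBlock>
struct KernelConfig
{
};

// One templated table replaces several per-type copies that differed only in DataT.
template <typename DataT>
using contraction_kk_table = std::tuple<KernelConfig<DataT, 256, 256, 128>,
                                        KernelConfig<DataT, 256, 128, 256>,
                                        KernelConfig<DataT, 128, 128, 128>>;

// contraction_kk_table<float> and contraction_kk_table<double> now share one
// definition, which is what the device_contraction_*_instance aliases achieve
// for the real instance lists.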
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ad5ce461 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
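// The *_compute_f32_* files introduced here register F64 tensor instances that
// accumulate in F32: that is the trailing compute-type template argument this
// patch adds to DeviceContractionMultipleD. A hedged caller-side sketch; the
// exact template-argument order is inferred from the signature change in this
// patch and may differ (assumes the CK headers added above are in scope):
#include <memory>
#include <vector>

using F64         = double;
using F32         = float;
using F64_Tuple   = ck::Tuple<F64>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear    = ck::tensor_operation::element_wise::Bilinear;

void collect_f64_compute_f32_kknn_instances()
{
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceContractionMultipleD<
        2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>
        instances;

    // Fills `instances` with one op per entry of device_contraction_f64_kk_instance;
    // alpha/beta for Bilinear are bound later, when arguments are created.
    ck::tensor_operation::device::instance::
        add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
            instances);
}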
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..ae3ee856 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..b72005ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..b94030e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
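// All of the add_*_instance functions in these files delegate to
// add_device_operation_instances, which appends one heap-allocated op per
// tuple entry; a client then probes the collected ops at run time and keeps
// the first (or fastest) one that supports its problem. A self-contained toy
// of that probing loop; BaseOp and IsSupported are simplified stand-ins for
// CK's BaseOperator / IsSupportedArgument machinery:
#include <memory>
#include <vector>

struct BaseOp
{
    virtual ~BaseOp()                = default;
    virtual bool IsSupported() const = 0; // real CK: IsSupportedArgument(arg_ptr)
};

BaseOp* pick_first_supported(const std::vector<std::unique_ptr<BaseOp>>& instances)
{
    for(const auto& op : instances)
        if(op->IsSupported())
            return op.get();
    return nullptr; // no tuning entry fits this problem shape
}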
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
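// The four-letter layout suffix in these file names (kknn, knnn, mknn, mnnn)
// records which index varies fastest in A/B/D/E respectively; D and E are
// always n-fastest in the bilinear set, so only the A and B letters change.
// A hedged sketch of deriving the suffix from strides (the helper and its
// calling convention are hypothetical):
#include <array>
#include <string>

std::string layout_suffix(const std::array<long, 4>& a_strides, // [m0, m1, k0, k1]
                          const std::array<long, 4>& b_strides) // [n0, n1, k0, k1]
{
    // Unit stride on the trailing k index means that tensor is k-fastest.
    const char a = (a_strides[3] == 1) ? 'k' : 'm';
    const char b = (b_strides[3] == 1) ? 'k' : 'n';
    return std::string{a, b, 'n', 'n'}; // D and E contribute the fixed "nn"
}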
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
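// The FIXME that follows (repeated in every file of this patch) proposes
// replacing the translation-unit-wide CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
// macro with a per-op template parameter. A sketch of that direction; the
// parameter, the helper, and the clamping semantics shown here are
// hypothetical, not CK's current API:
template <bool UseOobCheckOffsetTrick>
struct BufferLoadSketch
{
    static float load(const float* p, long offset, long valid_span)
    {
        if constexpr(UseOobCheckOffsetTrick)
            // emulate a hardware buffer load that returns 0 when out of bounds
            return (offset >= 0 && offset < valid_span) ? p[offset] : 0.f;
        else
            return p[offset]; // caller must guarantee the offset is in bounds
    }
};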
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
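// One invariant is visible across all of these tuning tables: the block-transfer
// thread-cluster lengths S<a, b, c> multiply out to the BlockSize column
// (S<4, 64, 1>, S<8, 32, 1>, and S<16,16, 1> all give 256; S<4, 32, 1> and
// S<8, 16, 1> give 128; S<4, 16, 1> gives 64). A small compile-time check of
// that rule, with a local stand-in for ck::Sequence:
#include <cstddef>

template <std::size_t... Is>
struct S
{
    static constexpr std::size_t product = (Is * ... * std::size_t{1});
};

static_assert(S<4, 64, 1>::product == 256, "cluster must fill a 256-thread block");
static_assert(S<8, 32, 1>::product == 256, "cluster must fill a 256-thread block");
static_assert(S<4, 32, 1>::product == 128, "cluster must fill a 128-thread block");
static_assert(S<4, 16, 1>::product == 64, "cluster must fill a 64-thread block");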
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_complex.hpp b/library/src/contraction/device/device_contraction_scale_complex.hpp new file mode 100644 index 00000000..5b70cc11 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_complex.hpp @@ -0,0 +1,707 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+#define HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP
+
+#include "../contraction_pack_util.hpp"
+#include "common.hpp"
+#include 
+
+namespace ck
+{
+ namespace tensor_operation
+ {
+ namespace device
+ {
+
+ using hiptensor::allocDevice;
+ using hiptensor::ceilDiv;
+ using hiptensor::DeviceDeleter;
+ using hiptensor::elementSpaceFromLengthsAndStrides;
+
+ using Bilinear = ck::tensor_operation::element_wise::Bilinear;
+ using BilinearComplex = ck::tensor_operation::element_wise::BilinearComplex;
+ using Scale = ck::tensor_operation::element_wise::Scale;
+ using ScaleComplex = ck::tensor_operation::element_wise::ScaleComplex;
+
+ // The following is a specialization class for scale contractions of complex types.
+ // For complex types, the contraction can be decomposed into four simple contractions
+ // (two scale and two bilinear) on the underlying real element type.
+ // The class implements a CK interface that wraps the four individual contraction
+ // operations and handles their arguments internally.
+ // Note: we assume that the data arrives in Array of Structures (AOS) format, i.e. as
+ // complex pairs. The argument initialization decomposes this data into Structure of
+ // Arrays (SOA) form so that the real and imaginary components can be operated on
+ // separately.
+
+ // Tensor Contraction:
+ // input : A
+ // input : B
+ // input : D0, D1, ...
+ // output : E
+ // C = a_op(A) * b_op(B)
+ // E = cde_op(C, D0, D1, ...)
+ // Assume:
+ // A[M0, M1, M2, ..., K0, K1, K2, ...]
+ // B[N0, N1, N2, ..., K0, K1, K2, ...]
+ // D[M0, M1, M2, ..., N0, N1, N2, ...]
+ // E[M0, M1, M2, ..., N0, N1, N2, ...]
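+
+ // A sketch of the decomposition used by the Argument/Invoker below. With
+ // A = A_r + i*A_i and B = B_r + i*B_i, complex multiplication expands to
+ //   A * B = (A_r*B_r - A_i*B_i) + i*(A_r*B_i + A_i*B_r),
+ // which maps onto one Scale plus one Bilinear contraction per output plane
+ // (E_r / E_i denote the unpacked mE_real / mE_imag planes):
+ //   E_r =  A_r * B_r                    Scale    {1.0}
+ //   E_r = -1.0 * (A_i * B_i) + 1.0*E_r  Bilinear {-1.0, 1.0}
+ //   E_i =  A_r * B_i                    Scale    {1.0}
+ //   E_i =  A_i * B_r + 1.0*E_i          Bilinear {1.0, 1.0}
+ // The complex scale factor itself is applied afterwards, when the multiply
+ // kernel re-packs the E_r / E_i planes from SOA back to AOS.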
+ template + struct DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + HIP_vector_type, + HIP_vector_type, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + ScaleComplex, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + HIP_vector_type, + LoopSched> + + : public DeviceContractionMultipleD, + HIP_vector_type, + ck::Tuple<>, + HIP_vector_type, + AElementwiseOperation, + BElementwiseOperation, + ScaleComplex, + HIP_vector_type> + { + // Complex device Op + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + // CDE Operations + using ScaleCDEElementwiseOperation = ScaleComplex; + using DecompScaleCDEElementwiseOperation = Scale; + using BilinearCDEElementwiseOperation = BilinearComplex; + using DecompBilinearCDEElementwiseOperation = Bilinear; + + // Complex types given through the interface + using ComplexA = HIP_vector_type; + using ComplexB = HIP_vector_type; + using ComplexDs = HIP_vector_type; + using ComplexE = HIP_vector_type; + using ComplexCompute = HIP_vector_type; + + // Internal functional types we will use to + // decompose the complex types and operate on. + using DecompA = ADataType; + using DecompB = BDataType; + using DecompDs = EDataType; + using DecompE = EDataType; + using DecompCompute = ComputeDataType; + + // For complex types, we need to make sure that all of the types are the same + static_assert(std::is_same_v && std::is_same_v + && std::is_same_v + && std::is_same_v, + "Complex operations must have the same data type"); + + static_assert(std::is_same_v || std::is_same_v, + "Complex operations only supported with single or double precision"); + + static constexpr index_t NumDTensor = 0; + + // The internal operation that we will decompose the complex operations with. 
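+ // Per real plane, ScaleDecompOp computes E = alpha * contraction(A, B), while
+ // BilinearDecompOp computes E = alpha * contraction(A, B) + beta * D.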
+ // For complex will be either float or double + using ScaleDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + DecompScaleCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + DecompCompute, + LoopSched>; + + // The internal operation that we will decompose the complex operations with. + // For complex will be either float or double + using BilinearDecompOp = DeviceContractionMultipleD_Xdl_CShuffle< + NumDimM, + NumDimN, + NumDimK, + DecompA, + DecompB, + AccDataType, + CShuffleDataType, + ck::Tuple, + DecompE, + AElementwiseOperation, + BElementwiseOperation, + DecompBilinearCDEElementwiseOperation, + GemmSpec, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + DecompCompute, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + using ScaleDecompArgument = typename ScaleDecompOp::Argument; + using BilinearDecompArgument = typename BilinearDecompOp::Argument; + + Argument(Argument&& other) + : mScaleArgs( + {std::move(other.mScaleArgs[0]), std::move(other.mScaleArgs[1])}) + , mBilinearArgs({std::move(other.mBilinearArgs[0]), + std::move(other.mBilinearArgs[1])}) + { + } + + Argument& operator=(Argument&& other) + { + if(this != &other) + { + mScaleArgs[0] = std::move(other.mScaleArgs[0]); + mScaleArgs[1] = std::move(other.mScaleArgs[1]); + mBilinearArgs[0] = std::move(other.mBilinearArgs[0]); + mBilinearArgs[1] = std::move(other.mBilinearArgs[1]); + } + return *this; + } + + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + 
const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) + : element_op(cde_element_op) + { + // Take the incoming arguments, treat them as complex. + + // Allocate Real and Imaginary inputs + auto elementsA + = elementSpaceFromLengthsAndStrides(a_ms_ks_lengths, a_ms_ks_strides); + auto elementsB + = elementSpaceFromLengthsAndStrides(b_ns_ks_lengths, b_ns_ks_strides); + elementsE + = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides); + + mA_real.reset(nullptr); + mA_imag.reset(nullptr); + mB_real.reset(nullptr); + mB_imag.reset(nullptr); + mE_real.reset(nullptr); + mE_imag.reset(nullptr); + + mE_grid = p_e_grid; + auto blockDim = dim3(1024); + + auto decompGrid = [blockDim](auto& out_r, + auto& out_i, + auto const* input_grid, + uint32_t elementCount) { + using DecompT = typename std::decay_t::element_type; + static_assert(std::is_same_v< + DecompT, + typename std::decay_t::element_type>, + "r and i buffers must be same type"); + + if(input_grid != nullptr) + { + out_r = std::move(allocDevice(elementCount)); + out_i = std::move(allocDevice(elementCount)); + + auto gridDim = dim3(ceilDiv(elementCount, blockDim.x)); + hiptensor::unpack<<>>( + input_grid, out_r.get(), out_i.get(), elementCount); + } + }; + + // Decompose the incoming data from AOS->SOA + decompGrid(mA_real, mA_imag, (const ComplexA*)p_a_grid, elementsA); + decompGrid(mB_real, mB_imag, (const ComplexB*)p_b_grid, elementsB); + decompGrid(mE_real, mE_imag, (const ComplexE*)p_e_grid, elementsE); + + auto allocScaleArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + auto allocBilinearArgs = [a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op](auto& out_e, + auto const& in_a, + auto const& in_b, + auto const& in_d, + auto const& cde_element_op) { + return std::make_unique( + in_a.get(), + in_b.get(), + std::array{in_d.get()}, + out_e.get(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{e_ms_ns_lengths}, + std::array, 1>{e_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + }; + + mScaleArgs[0] = allocScaleArgs( + mE_real, mA_real, mB_real, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[0] + = allocBilinearArgs(mE_real, + mA_imag, + mB_imag, + mE_real, + DecompBilinearCDEElementwiseOperation{-1.0f, 1.0f}); + + mScaleArgs[1] = allocScaleArgs( + mE_imag, mA_real, mB_imag, DecompScaleCDEElementwiseOperation{1.0f}); + mBilinearArgs[1] + = allocBilinearArgs(mE_imag, + mA_imag, + mB_real, + mE_imag, + DecompBilinearCDEElementwiseOperation{1.0f, 1.0f}); + } + + void Print() const + { + std::cout << "ScaleArgs0:" << std::endl; + mScaleArgs[0]->Print(); + std::cout << "ScaleArgs1:" << std::endl; + mScaleArgs[1]->Print(); + std::cout << "BilinearArgs0:" << std::endl; + mBilinearArgs[0]->Print(); + std::cout << "BilinearArgs1:" << std::endl; + 
mBilinearArgs[1]->Print(); + } + + // private: + // Each argument set for complex: + std::unique_ptr mScaleArgs[2]; + std::unique_ptr mBilinearArgs[2]; + + template + using DeviceArray = std::unique_ptr; + + // Manage extra memory for AOS->SOA + DeviceArray mA_real; + DeviceArray mA_imag; + DeviceArray mB_real; + DeviceArray mB_imag; + DeviceArray mE_real; + DeviceArray mE_imag; + + ScaleCDEElementwiseOperation element_op; + void* mE_grid; + index_t elementsE; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = typename DeviceOp::Argument; + + Invoker() + : mScaleInvoker(std::make_unique()) + , mBilinearInvoker(std::make_unique()) + { + } + + Invoker(Invoker&& other) + : mScaleInvoker(std::move(other.mScaleInvoker)) + , mBilinearInvoker(std::move(other.mBilinearInvoker)) + { + } + + Invoker& operator=(Invoker&& other) + { + if(this != &other) + { + mScaleInvoker = std::move(other.mScaleInvoker); + mBilinearInvoker = std::move(other.mBilinearInvoker); + } + return *this; + } + + float Run(const Argument& arg, + const StreamConfig& stream_config = StreamConfig{}) + { + auto r0 = mScaleInvoker->Run(arg.mScaleArgs[0].get(), stream_config); + auto r1 = mScaleInvoker->Run(arg.mScaleArgs[1].get(), stream_config); + auto r2 = mBilinearInvoker->Run(arg.mBilinearArgs[0].get(), stream_config); + auto r3 = mBilinearInvoker->Run(arg.mBilinearArgs[1].get(), stream_config); + + if(arg.mE_grid != nullptr) + { + auto blockDim = dim3(1024); + auto gridDim = dim3(ceilDiv(arg.elementsE, blockDim.x)); + + hiptensor::multiply<<>>(arg.mE_real.get(), + arg.mE_imag.get(), + ((ComplexE*)arg.mE_grid), + arg.element_op.scale_, + arg.elementsE); + } + + return r0 + r1 + r2 + r3; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + + std::unique_ptr mScaleInvoker; + std::unique_ptr mBilinearInvoker; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + return ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[0].get())) + && ScaleDecompOp::IsSupportedArgument(*(arg.mScaleArgs[1].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[0].get())) + && BilinearDecompOp::IsSupportedArgument(*(arg.mBilinearArgs[1].get())); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // polymorphic + virtual void SetWorkSpacePointer(BaseArgument* p_arg, + void* p_workspace, + StreamConfig const& s + = StreamConfig{}) const override + { + // Call the base, then fwd to each arg. 
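+ // All four decomposed sub-arguments receive the same workspace pointer.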
+ this->BaseOperator::SetWorkSpacePointer(p_arg, p_workspace, s); + auto* arg = dynamic_cast(p_arg); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mScaleArgs[1].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[0].get(), p_workspace, s); + this->BaseOperator::SetWorkSpacePointer( + arg->mBilinearArgs[1].get(), p_workspace, s); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() + { + return Invoker{}; + } + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ks_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + ScaleCDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + }; + + } // namespace device + } // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_CONTRACTION_SCALE_COMPLEX_HPP diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..1da8301f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced 
Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..82c17500 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..1febb560 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..02b9d719 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp new file mode 100644 index 00000000..3133f4cd --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance + = device_contraction_kk_instance; + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck + diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp new file mode 100644 index 00000000..b358be8a --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp new file mode 100644 index 00000000..359a074a --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp new file mode 100644 index 00000000..4cc8659d --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp new file mode 100644 index 00000000..1cac8ebb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance.cpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather +// than using default setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter +// of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp new file mode 100644 index 00000000..e60bbd61 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp new file mode 100644 index 00000000..e44d24e1 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp new file mode 100644 index 00000000..dee9ce39 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance.cpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include "common.hpp" +#include "device_contraction_scale_complex.hpp" + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + using F64 = double; + using CF64 = hipDoubleComplex; + using Empty_Tuple = ck::Tuple<>; + using ScaleComplex = element_wise::ScaleComplex; + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance{}); + } + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..5917e466 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
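// The kkn/knn/mkn/mnn file suffixes encode which index is the unit-stride
// ("fast changing") axis for A, B and E respectively; D, when present,
// follows E. So "kkn" means k1 is contiguous in A[m0, m1, k0, k1] and
// B[n0, n1, k0, k1], while n1 is contiguous in E[m0, m1, n0, n1]. A minimal
// stride computation illustrating the convention (names are ours, not the
// library's):

#include <array>
#include <cstddef>

// Row-major strides: the last extent gets stride 1, i.e. it is the fast axis.
std::array<std::size_t, 4> rowMajorStrides(const std::array<std::size_t, 4>& lens)
{
    std::array<std::size_t, 4> strides{};
    std::size_t s = 1;
    for(int i = 3; i >= 0; --i)
    {
        strides[i] = s;
        s *= lens[i];
    }
    return strides;
}
// rowMajorStrides({m0, m1, k0, k1}) makes k1 unit-stride: the first "k" in "kkn".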
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..216f470e --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..3401b605 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..fe2fa97d --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
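// Each of these translation units repeats one registration pattern: a type
// alias names a tuple of concrete kernel configurations, and an
// add_..._instance() function default-constructs each element and appends it
// to a caller-owned vector of type-erased operations, which is essentially
// what add_device_operation_instances does. A stripped-down sketch of the
// pattern, with simplified stand-in types:

#include <memory>
#include <tuple>
#include <vector>

struct OpBase
{
    virtual ~OpBase() = default;
};
struct KernelConfigA : OpBase {};
struct KernelConfigB : OpBase {};

// The alias plays the role of device_contraction_..._instance.
using instance_tuple = std::tuple<KernelConfigA, KernelConfigB>;

// The add_* function appends one instance per tuple element.
void add_instances(std::vector<std::unique_ptr<OpBase>>& instances)
{
    std::apply(
        [&](auto... kernels) {
            (instances.push_back(std::make_unique<decltype(kernels)>(kernels)), ...);
        },
        instance_tuple{});
}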
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
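// The *_compute_bf16 and *_compute_f16 variants keep F32 storage for A, B and
// E but, judging by the file names and the extra template parameter on
// device_contraction_*_instance, carry out the inner arithmetic at reduced
// precision. The practical effect is fewer mantissa bits in flight; a quick
// self-contained illustration of what rounding an f32 value down to bf16
// costs (manual truncation here; the library has its own conversion types):

#include <cstdint>
#include <cstring>
#include <iostream>

float truncateToBf16(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0xFFFF0000u; // keep the sign, exponent and top 7 mantissa bits
    std::memcpy(&x, &bits, sizeof(bits));
    return x;
}

int main()
{
    std::cout << truncateToBf16(1.2345678f) << '\n'; // prints ~1.23438
}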
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp new file mode 100644 index 00000000..e4e4b7de --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp new file mode 100644 index 00000000..a641f6e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 88345e74..24d2d570 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 
1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( std::vector>>& instances) + Scale, + 
F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 38702afd..f559dc06 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| 
ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 735a5e34..a522052d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..dac46620 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance
+                    = device_contraction_f64_kk_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
new file mode 100644
index 00000000..0830b49f
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance
+                    = device_contraction_f64_kn_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
new file mode 100644
index 00000000..9a716ba3
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance
+                    = device_contraction_f64_mk_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
new file mode 100644
index 00000000..e02ac144
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using the
+// default setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance
+                    = device_contraction_f64_mn_instance;
+
+                void
+                add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+                    std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
+                                                                           2,
+                                                                           2,
+                                                                           F64,
+                                                                           F64,
+                                                                           Empty_Tuple,
+                                                                           F64,
+                                                                           PassThrough,
+                                                                           PassThrough,
+                                                                           Scale,
+                                                                           F32>>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index f8904a8f..6f168ee2 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default +// setting. Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template <ck::index_t... Is> - using S = ck::Sequence<Is...>; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_element_wise_operation_complex.hpp b/library/src/contraction/device/device_element_wise_operation_complex.hpp new file mode 100644 index 00000000..a01ced36 --- /dev/null +++ b/library/src/contraction/device/device_element_wise_operation_complex.hpp @@ -0,0 +1,97 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#ifndef HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP +#define HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP + +#include +#include +#include + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct ScaleComplex : public Scale +{ + __host__ __device__ ScaleComplex(hipDoubleComplex scale) : Scale(hipCreal(scale)) + { + scale_ = scale; + } + + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(hipFloatComplex& y, const hipFloatComplex& x) const + { + y = hipCmulf(hipComplexDoubleToFloat(scale_), x); + }; + + template <> + __host__ __device__ void operator()(hipDoubleComplex& y, const hipDoubleComplex& x) const + { + y = hipCmul(scale_, x); + }; + + // complex * float + hipDoubleComplex scale_; +}; + +struct BilinearComplex : public Bilinear +{ + BilinearComplex(hipDoubleComplex alpha, hipDoubleComplex beta) : Bilinear(hipCreal(alpha), hipCreal(beta)) + { + alpha_ = alpha; + beta_ = beta; + } + + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const; + + template <> + __host__ __device__ constexpr void + operator()(hipDoubleComplex& y, const hipDoubleComplex& x0, const hipDoubleComplex& x1) const + { + y = hipCadd(hipCmul(alpha_, x0), hipCmul(beta_, x1)); + }; + + template <> + __host__ __device__ constexpr void + operator()(hipFloatComplex& y, const hipFloatComplex& x0, const hipFloatComplex& x1) const + { + y = hipCaddf(hipCmulf(hipComplexDoubleToFloat(alpha_), x0), hipCmulf(hipComplexDoubleToFloat(beta_), x1)); + }; + + hipDoubleComplex alpha_; + hipDoubleComplex beta_; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck + +#endif // HIPTENSOR_ELEMENT_WISE_OPERATION_COMPLEX_HPP diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp similarity index 73% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp index e8f73b58..81d7edf5 100644 --- a/library/src/contraction/device/contraction_bilinear.hpp +++ b/library/src/contraction/device/hiptensor_contraction_bilinear_instances.hpp @@ -37,120 +37,128 @@ namespace ck { namespace instance { + using F32 = float; + using CF32 = 
hipFloatComplex; + using CF32_Tuple = ck::Tuple; + + using F64 = double; + using CF64 = hipDoubleComplex; + using CF64_Tuple = ck::Tuple; + + using BilinearComplex = element_wise::BilinearComplex; - // float void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( std::vector>>& - instances); + BilinearComplex, + CF32>>>& instances); // double void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); void - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( std::vector>>& - instances); + BilinearComplex, + CF64>>>& instances); // Contraction + Bilinear template + typename DsDataType, + typename EDataType, + typename ComputeDataT> struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>> + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, - ck::Tuple, - EDataType, + HIP_vector_type, + HIP_vector_type, + ck::Tuple>, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>; + ck::tensor_operation::element_wise::BilinearComplex, + HIP_vector_type>; static auto GetInstances() { std::vector> op_ptrs; if constexpr(is_same_v && is_same_v - && is_same_v && is_same_v) + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && 
NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_cf32_compute_cf32_mnnn_instance( op_ptrs); } } if constexpr(is_same_v && is_same_v - && is_same_v + && is_same_v && is_same_v) { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_kknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_knnn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mknn_instance( op_ptrs); - add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_cf64_compute_cf64_mnnn_instance( op_ptrs); } } diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp similarity index 62% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale_instances.hpp index 916f79de..705ac6c0 100644 --- a/library/src/contraction/device/contraction_scale.hpp +++ b/library/src/contraction/device/hiptensor_contraction_scale_instances.hpp @@ -37,136 +37,161 @@ namespace ck { namespace instance { - - // float - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances); - - // double - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances); - - void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances); - + using F32 = float; + using CF32 = hipFloatComplex; + using Empty_Tuple = ck::Tuple<>; + + using F64 = double; + using CF64 = hipDoubleComplex; + + using ScaleComplex = element_wise::ScaleComplex; + + void + 
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( + std::vector>>& instances); + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( + std::vector>>& instances); + // Contraction + Scale template - struct HipTensorDeviceOperationInstanceFactory< + typename EDataType, + typename ComputeDataType> + struct DeviceOperationInstanceFactory< ck::tensor_operation::device::DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>> + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>> { using DeviceOp = DeviceContractionMultipleD< NumDimM, NumDimN, NumDimK, - ADataType, - BDataType, + HIP_vector_type, + HIP_vector_type, ck::Tuple<>, - EDataType, + HIP_vector_type, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>; + ck::tensor_operation::element_wise::ScaleComplex, + HIP_vector_type>; static auto GetInstances() { @@ -177,13 +202,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf32_cf32_cf32_compute_cf32_mnn_instance( op_ptrs); } } @@ -193,13 +218,13 @@ namespace ck { if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) { - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_kkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_knn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + 
add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mkn_instance( op_ptrs); - add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_cf64_cf64_cf64_compute_cf64_mnn_instance( op_ptrs); } } diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..eb7d8919 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -147,7 +147,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* // Use a scale contraction due to // tensor C-descriptor is empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::SCALE, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? hiptensor::ContractionOpId_t::SCALE_COMPLEX + : hiptensor::ContractionOpId_t::SCALE; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, @@ -161,7 +165,11 @@ hiptensorStatus_t hiptensorInitContractionDescriptor(const hiptensorHandle_t* { // Use a bilinear contraction due to // tensor C-descriptor is not empty - *desc = {(int32_t)hiptensor::ContractionOpId_t::BILINEAR, + auto contractionOp + = typeCompute == HIPTENSOR_COMPUTE_C32F || typeCompute == HIPTENSOR_COMPUTE_C64F + ? hiptensor::ContractionOpId_t::BILINEAR_COMPLEX + : hiptensor::ContractionOpId_t::BILINEAR; + *desc = {(int32_t)contractionOp, typeCompute, {*descA, *descB, *descC, *descD}, {alignmentRequirementA, @@ -242,17 +250,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. 
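// A minimal caller-side sketch of the complex routing introduced above, assuming the public
// hiptensorInitContractionDescriptor signature; the descriptor/mode names (a_ms_ks, b_ns_ks,
// d_ms_ns, modeA/B/D) are hypothetical stand-ins borrowed from the samples later in this
// patch. Passing a complex compute type is what selects the *_COMPLEX op ids:
//
//     hipFloatComplex alpha = make_hipFloatComplex(1.0f, 1.0f);
//     hiptensorContractionDescriptor_t desc;
//     CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(
//         handle, &desc,
//         &a_ms_ks, modeA.data(), alignmentRequirementA,
//         &b_ns_ks, modeB.data(), alignmentRequirementB,
//         nullptr, nullptr, 0, // empty C descriptor => scale contraction
//         &d_ms_ns, modeD.data(), alignmentRequirementD,
//         HIPTENSOR_COMPUTE_C32F)); // complex compute type => ContractionOpId_t::SCALE_COMPLEX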
if(solnQ.solutionCount() == 0) @@ -461,15 +458,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +498,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +517,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +582,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast<const float*>(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast<const double*>(alpha))); - } + auto alphaValue = hiptensor::readVal<hiptensor::ScalarData>( + alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%s", std::to_string(alphaValue).c_str()); } if(beta == nullptr) @@ -602,15 +593,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast<const float*>(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast<const double*>(beta))); - } + auto betaValue = hiptensor::readVal<hiptensor::ScalarData>( + beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%s", std::to_string(betaValue).c_str()); } } else @@ -708,17 +693,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, @@ -755,7 +729,17 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, 
true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -784,7 +768,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..5a31a91f 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -79,6 +79,14 @@ namespace hiptensor { return sizeof(uint64_t); } + else if(id == HIP_C_32F) + { + return sizeof(hipFloatComplex); + } + else if(id == HIP_C_64F) + { + return sizeof(hipDoubleComplex); + } else if(id == NONE_TYPE) { return 0; @@ -126,12 +134,124 @@ namespace hiptensor { return HIPTENSOR_COMPUTE_32U; } + else if(hipType == HIP_C_32F) + { + return HIPTENSOR_COMPUTE_C32F; + } + else if(hipType == HIP_C_64F) + { + return HIPTENSOR_COMPUTE_C64F; + } else { return HIPTENSOR_COMPUTE_NONE; } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + return ScalarData(id, *(_Float16*)value); + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + return ScalarData(id, *(hip_bfloat16*)value); + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + return ScalarData(id, *(float*)value); + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + return ScalarData(id, *(double*)value); + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + return ScalarData(id, *(uint8_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + return ScalarData(id, *(int8_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + return ScalarData(id, *(uint32_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + return ScalarData(id, *(int32_t*)value); + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + auto complex = *(hipFloatComplex*)value; + return {id, complex.x, complex.y}; + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + auto complex = *(hipDoubleComplex*)value; + return {id, complex.x, complex.y}; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return {HIPTENSOR_COMPUTE_NONE, 0, 0}; + } + } + + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = (uint8_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = (int8_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = (uint32_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = (int32_t)value.mReal; + } + else if(id == HIPTENSOR_COMPUTE_C32F) + { + *(hipFloatComplex*)addr = hipComplexDoubleToFloat(value.mComplex); + } + else if(id == HIPTENSOR_COMPUTE_C64F) + { + *(hipDoubleComplex*)addr = value.mComplex; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; 
+#endif // !NDEBUG + return; + } + } } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) @@ -144,11 +264,11 @@ bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) { return (computeType == HIPTENSOR_COMPUTE_16F); } - else if(hipType == HIP_R_32F) + else if(hipType == HIP_R_32F || hipType == HIP_C_32F) { return (computeType == HIPTENSOR_COMPUTE_32F); } - else if(hipType == HIP_R_64F) + else if(hipType == HIP_R_64F || hipType == HIP_C_64F) { return (computeType == HIPTENSOR_COMPUTE_64F); } @@ -207,3 +327,19 @@ bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType) { return !(computeType == hipType); } + +namespace std +{ + std::string to_string(const hiptensor::ScalarData& value) + { + if(value.mType == HIPTENSOR_COMPUTE_C32F || value.mType == HIPTENSOR_COMPUTE_C64F) + { + return string() + "[" + to_string(value.mComplex.x) + ", " + to_string(value.mComplex.y) + + "]"; + } + else + { + return to_string(value.mReal); + } + } +} diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..8d185758 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,9 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F) && (dataType != HIP_C_32F) + && (dataType != HIP_C_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..db9ff6c7 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -31,6 +31,7 @@ // Include order needs to be preserved #include #include +#include #include #include @@ -43,6 +44,46 @@ namespace hiptensor // Used to map to empty tensors struct NoneType; + struct ScalarData + { + hiptensorComputeType_t mType; + union + { + double mReal; + hipDoubleComplex mComplex; + }; + + ScalarData() = default; + ScalarData(hiptensorComputeType_t type, double real, double imag = 0) + { + mType = type; + if(type == HIPTENSOR_COMPUTE_C32F || type == HIPTENSOR_COMPUTE_C64F) + { + mComplex = make_hipDoubleComplex(real, imag); + } + else + { + mReal = real; + } + } + operator float() const + { + return static_cast(mReal); + } + operator double() const + { + return mReal; + } + operator hipFloatComplex() const + { + return hipComplexDoubleToFloat(mComplex); + } + operator hipDoubleComplex() const + { + return mComplex; + } + }; + static constexpr hipDataType NONE_TYPE = (hipDataType)31; // Map type to runtime HipDataType @@ -65,6 +106,7 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, ScalarData value); } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); @@ -73,6 +115,11 @@ bool operator==(hiptensorComputeType_t computeType, hipDataType hipType); bool operator!=(hipDataType hipType, hiptensorComputeType_t computeType); bool operator!=(hiptensorComputeType_t computeType, hipDataType hipType); +namespace std +{ + std::string to_string(const hiptensor::ScalarData& value); +} + #include "data_types_impl.hpp" #endif // HIPTENSOR_LIBRARY_DATA_TYPES_HPP diff --git a/library/src/include/data_types_impl.hpp 
b/library/src/include/data_types_impl.hpp index 7df6d7d9..c55f0d7e 100644 --- a/library/src/include/data_types_impl.hpp +++ b/library/src/include/data_types_impl.hpp @@ -105,6 +105,18 @@ namespace hiptensor static constexpr auto value = HIP_R_64U; }; + template <> + struct HipDataType + { + static constexpr auto value = HIP_C_32F; + }; + + template <> + struct HipDataType + { + static constexpr auto value = HIP_C_64F; + }; + template <> struct HipDataType { @@ -162,6 +174,14 @@ namespace hiptensor { return static_cast(*(uint64_t*)value); } + else if constexpr(std::is_same_v && id == HIP_C_32F) + { + return static_cast(*(hipFloatComplex*)value); + } + else if constexpr(std::is_same_v && id == HIP_C_64F) + { + return static_cast(*(hipDoubleComplex*)value); + } else { #if !NDEBUG @@ -215,6 +235,8 @@ namespace hiptensor } } + template <> + ScalarData readVal(void const* value, hiptensorComputeType_t id); } // namespace hiptensor #endif // HIPTENSOR_LIBRARY_DATA_TYPES_IMPL_HPP diff --git a/library/src/include/meta_traits.hpp b/library/src/include/meta_traits.hpp index 0e039cd6..2cd0d740 100644 --- a/library/src/include/meta_traits.hpp +++ b/library/src/include/meta_traits.hpp @@ -32,7 +32,7 @@ namespace hiptensor // Placeholder for building traits on any type T // Use partial or full specialization for any class. - template + template struct MetaTraits; } // namespace hiptensor diff --git a/library/src/include/xfloat32.hpp b/library/src/include/xfloat32.hpp deleted file mode 100644 index 6e9168cf..00000000 --- a/library/src/include/xfloat32.hpp +++ /dev/null @@ -1,334 +0,0 @@ -/* ************************************************************************ - * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- - * ies of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- - * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- - * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ************************************************************************ */ - -/*!\file - * \brief xfloat32.h provides struct for hiptensor_xfloat32 typedef - */ - -#ifndef HIPTENSOR_XFLOAT32_HPP -#define HIPTENSOR_XFLOAT32_HPP - -#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only -// include a minimal definition of hiptensor_xfloat32 - -#include -typedef struct -{ - float data; -} hiptensor_xfloat32; - -#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#include -#include -#include -#include -#include -#include - -#include "config.hpp" - -struct hiptensor_xfloat32 -{ - float data; - - enum round_t - { - round_up - }; - - HIPTENSOR_HOST_DEVICE hiptensor_xfloat32() = default; - - // round upper 19 bits of IEEE float to convert to xfloat32 - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f, round_t) - : data(float_to_xfloat32(f)) - { - } - - explicit HIPTENSOR_HOST_DEVICE hiptensor_xfloat32(float f) - : data(truncate_float_to_xfloat32(f)) - { - } - - // zero extend lower 13 bits of xfloat32 to convert to IEEE float - HIPTENSOR_HOST_DEVICE operator float() const - { - return data; - } - - explicit HIPTENSOR_HOST_DEVICE operator bool() const - { - union - { - float fp32; - uint32_t int32; - } u = {data}; - return u.int32 & 0x7fffe000; - } - - explicit HIPTENSOR_HOST_DEVICE operator uint32_t() const - { - return uint32_t(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator long() const - { - return long(float(*this)); - } - - explicit HIPTENSOR_HOST_DEVICE operator double() const - { - return double(float(*this)); - } - -private: - static HIPTENSOR_HOST_DEVICE float float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - if(~u.int32 & 0x7f800000) - { - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the xfloat32 mantissa up by adding 0xFFF, plus - // 1 if the least significant bit of the xfloat32 mantissa is 1 (odd). - // This causes the xfloat32's mantissa to be incremented by 1 if the 13 - // least significant bits of the float mantissa are greater than 0x1000, - // or if they are equal to 0x1000 and the least significant bit of the - // xfloat32 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 13 bits are exactly 0x1000. If the xfloat32 mantissa already - // has the value 0x3ff, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded xfloat32 value. When the xfloat32 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x3FF, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the xfloat32 value has an exponent of 0xFE and a mantissa of 0x3FF, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - - u.int32 += 0xfff + ((u.int32 >> 13) & 1); // Round to nearest, round to even - } - else if(u.int32 & 0x1fff) - { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. 
If any of the - // lower 13 bits of the mantissa are 1, we set the least significant bit - // of the xfloat32 mantissa, in order to preserve signaling NaN in case - // the xfloat32's mantissa bits are all 0. - u.int32 |= 0x2000; // Preserve signaling NaN - } - - u.int32 &= 0xffffe000; - return u.fp32; - } - - // Truncate instead of rounding - static HIPTENSOR_HOST_DEVICE float truncate_float_to_xfloat32(float f) - { - union - { - float fp32; - uint32_t int32; - } u = {f}; - - u.int32 = u.int32 & 0xffffe000; - return u.fp32; - } -}; - -typedef struct -{ - float data; -} hiptensor_xfloat32_public; - -static_assert(std::is_standard_layout{}, - "hiptensor_xfloat32 is not a standard layout type, and thus is " - "incompatible with C."); - -static_assert(std::is_trivial{}, - "hiptensor_xfloat32 is not a trivial type, and thus is " - "incompatible with C."); - -static_assert(sizeof(hiptensor_xfloat32) == sizeof(hiptensor_xfloat32_public) - && offsetof(hiptensor_xfloat32, data) - == offsetof(hiptensor_xfloat32_public, data), - "internal hiptensor_xfloat32 does not match public hiptensor_xfloat32"); - -inline std::ostream& operator<<(std::ostream& os, const hiptensor_xfloat32& xf32) -{ - return os << float(xf32); -} - -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a) -{ - return a; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a) -{ - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - u.int32 ^= 0x80000000; - return hiptensor_xfloat32(u.fp32); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator+(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) + float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator-(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) - float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator*(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) * float(b)); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator/(hiptensor_xfloat32 a, - hiptensor_xfloat32 b) -{ - return hiptensor_xfloat32(float(a) / float(b)); -} -inline HIPTENSOR_HOST_DEVICE bool operator<(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) < float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator==(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return float(a) == float(b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return b < a; -} -inline HIPTENSOR_HOST_DEVICE bool operator<=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a > b); -} -inline HIPTENSOR_HOST_DEVICE bool operator!=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a == b); -} -inline HIPTENSOR_HOST_DEVICE bool operator>=(hiptensor_xfloat32 a, hiptensor_xfloat32 b) -{ - return !(a < b); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator+=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a + b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator-=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a - b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator*=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a * b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator/=(hiptensor_xfloat32& a, - hiptensor_xfloat32 b) -{ - return a = a / b; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator++(hiptensor_xfloat32& a) -{ - return a += hiptensor_xfloat32(1.0f); 
-} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32& operator--(hiptensor_xfloat32& a) -{ - return a -= hiptensor_xfloat32(1.0f); -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator++(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - ++a; - return orig; -} -inline HIPTENSOR_HOST_DEVICE hiptensor_xfloat32 operator--(hiptensor_xfloat32& a, int) -{ - hiptensor_xfloat32 orig = a; - --a; - return orig; -} - -namespace std -{ - constexpr HIPTENSOR_HOST_DEVICE bool isinf(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && !(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool isnan(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return !(~u.int32 & 0x7f800000) && +(u.int32 & 0x7fe000); - } - constexpr HIPTENSOR_HOST_DEVICE bool iszero(hiptensor_xfloat32 a) - { - union - { - float fp32; - uint32_t int32; - } u = {a.data}; - return (u.fp32 == 0.0f); - } - - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 sin(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(sinf(float(a))); - } - HIPTENSOR_HOST_DEVICE inline hiptensor_xfloat32 cos(hiptensor_xfloat32 a) - { - return hiptensor_xfloat32(cosf(float(a))); - } - - HIPTENSOR_HOST_DEVICE constexpr hiptensor_xfloat32 real(const hiptensor_xfloat32& a) - { - return a; - } -} - -#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) - -#endif // HIPTENSOR_XFLOAT32_HPP diff --git a/library/src/permutation/permutation_cpu_reference_impl.hpp b/library/src/permutation/permutation_cpu_reference_impl.hpp index c1d4a3af..4820274f 100644 --- a/library/src/permutation/permutation_cpu_reference_impl.hpp +++ b/library/src/permutation/permutation_cpu_reference_impl.hpp @@ -92,7 +92,7 @@ namespace hiptensor auto bOffset = std::inner_product(bIndices.rbegin(), bIndices.rend(), bStrides.rbegin(), 0); #endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR - B[bOffset] = static_cast<DataType>(A[elementIndex] * alphaValue); + B[bOffset] = static_cast<DataType>(A[elementIndex] * (DataType)alphaValue); } return HIPTENSOR_STATUS_SUCCESS; diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..d255c0e4 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,16 +26,65 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32 simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp) + 
add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_cf32_cf32_cf32_compute_cf32 simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 
simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction.hpp similarity index 86% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction.hpp index 5704a59d..95c5d0f6 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -28,37 +28,21 @@ #include #include #include -#include #include #include #include #include "common.hpp" -int main(int argc, char* argv[]) +template +int bilinearContractionSample(void* alpha, void* beta) { - /*************************************** - * Check device support * - **************************************/ - if(!isF32Supported()) - { - std::cout << "unsupported host device" << std::endl; - exit(EXIT_FAILURE); - } - - typedef float ADataType; - typedef float BDataType; - typedef float CDataType; - typedef float floatTypeCompute; - - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - /********************** * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * *C_{m,n,u,v} @@ -74,12 +58,12 @@ int main(int argc, char* argv[]) std::unordered_map extent; - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; std::vector c_ms_ns_lengths; for(auto mode : modeC) @@ -166,19 +150,41 @@ int main(int argc, char* argv[]) /******************* * Initialize data *******************/ + int initMethod = 1; // TODO read value from commandline for(int64_t i = 0; i < elementsA; i++) { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + 
A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsB; i++) { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsC; i++) { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + C[i] = (CDataType)(float(i) / 100); + } } /******************************************** @@ -193,7 +199,6 @@ /************************************************ * Retrieve the memory alignment for each tensor ************************************************/ - uint32_t alignmentRequirementA; CHECK_HIPTENSOR_ERROR( hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); @@ -262,27 +267,13 @@ std::cout << "Launching contraction kernel..." << std::endl; - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); + CHECK_HIPTENSOR_ERROR(hiptensorContraction( + handle, &plan, alpha, A_d, B_d, beta, C_d, C_d, workspace, worksize, 0 /* stream */)); #if !NDEBUG bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -305,6 +296,15 @@ hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -318,6 +318,12 @@ hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp new file mode 100644 index 00000000..52915200 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+
+    floatTypeCompute alpha{1.0f};
+    floatTypeCompute beta{1.0f};
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>(&alpha, &beta);
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp
new file mode 100644
index 00000000..5b3bb7cc
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_cf32_cf32_cf32_cf32_compute_cf32.cpp
@@ -0,0 +1,58 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex CDataType; + typedef hipFloatComplex floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeC = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + + floatTypeCompute alpha{1.0f, 1.0f}; + floatTypeCompute beta{1.0f, 1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp new file mode 100644 index 00000000..8de0c534 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp new file mode 100644 index 00000000..6ce6d3c0 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp new file mode 100644 index 00000000..d4e28761 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp new file mode 100644 index 00000000..e493f1c3 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp new file mode 100644 index 00000000..0faffc3e --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha{1.0f}; + floatTypeCompute beta{1.0f}; + return bilinearContractionSample(&alpha, &beta); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp new file mode 100644 index 00000000..d5024eba --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF64Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double CDataType;
+    typedef double floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeC       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F;
+
+    floatTypeCompute alpha{1.0f};
+    floatTypeCompute beta{1.0f};
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>(&alpha, &beta);
+}
diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction.hpp
similarity index 91%
rename from samples/01_contraction/simple_scale_contraction_f32.cpp
rename to samples/01_contraction/simple_scale_contraction.hpp
index c76ec370..5db4598d 100644
--- a/samples/01_contraction/simple_scale_contraction_f32.cpp
+++ b/samples/01_contraction/simple_scale_contraction.hpp
@@ -34,29 +34,15 @@
 #include "common.hpp"

-int main(int argc, char* argv[])
+template <typename ADataType,
+          typename BDataType,
+          typename DDataType,
+          typename floatTypeCompute,
+          hipDataType typeA,
+          hipDataType typeB,
+          hipDataType typeD,
+          hiptensorComputeType_t typeCompute>
+int scaleContractionSample(void* alpha)
 {
-    /***************************************
-     * Check device support                *
-     **************************************/
-    if(!isF32Supported())
-    {
-        std::cout << "unsupported host device" << std::endl;
-        exit(EXIT_FAILURE);
-    }
-
-    typedef float ADataType;
-    typedef float BDataType;
-    typedef float DDataType;
-    typedef float floatTypeCompute;
-
-    hipDataType            typeA       = HIP_R_32F;
-    hipDataType            typeB       = HIP_R_32F;
-    hipDataType            typeD       = HIP_R_32F;
-    hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
-
-    floatTypeCompute alpha = (floatTypeCompute)1.0f;
-
     /**********************
     * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v}
     **********************/
@@ -71,12 +57,12 @@ int main(int argc, char* argv[])

     std::unordered_map<int32_t, int64_t> extent;

-    extent['m'] = 5;
-    extent['n'] = 6;
-    extent['u'] = 3;
-    extent['v'] = 4;
-    extent['h'] = 3;
-    extent['k'] = 4;
+    extent['m'] = 4;
+    extent['n'] = 3;
+    extent['u'] = 4;
+    extent['v'] = 3;
+    extent['h'] = 6;
+    extent['k'] = 5;

     std::vector<int64_t> d_ms_ns_lengths;
     for(auto mode : modeD)
@@ -163,14 +149,29 @@ int main(int argc, char* argv[])
     /*******************
      * Initialize data
      *******************/
+    int initMethod = 1; // TODO read the value from command line
     for(int64_t i = 0; i < elementsA; i++)
     {
-        A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100;
+        if(initMethod == 0)
+        {
+            A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            A[i] = (ADataType)(float(i) / 100);
+        }
     }

     for(int64_t i = 0; i < elementsB; i++)
     {
-        B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100;
+        if(initMethod == 0)
+        {
+            B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            B[i] = (BDataType)(float(i) / 100);
+        }
     }

     for(int64_t i = 0; i < elementsD; i++)
@@ -260,7 +261,7 @@ int main(int argc, char* argv[])

     CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
                                                &plan,
-                                               (void*)&alpha,
+                                               alpha,
                                                A_d,
                                                B_d,
                                                nullptr,
@@ -270,8 +271,6 @@ int main(int argc, char* argv[])
                                                worksize,
                                                0 /* stream */));

-    CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost));
-
 #if !NDEBUG
     bool printElements = false;
     bool storeElements = false;
diff --git a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp
new file mode 100644
index 00000000..5a991dbc
--- /dev/null
+++ b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_scale_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef hip_bfloat16 ADataType;
+    typedef hip_bfloat16 BDataType;
+    typedef hip_bfloat16 DDataType;
+    typedef float        floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_16BF;
+    constexpr hipDataType            typeB       = HIP_R_16BF;
+    constexpr hipDataType            typeD       = HIP_R_16BF;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
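+    // With HIPTENSOR_COMPUTE_32F the library reads alpha through readVal as a
+    // float, so the scalar below is declared with the compute type rather than
+    // the bf16 tensor data type.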
+
+    floatTypeCompute alpha = 1;
+    return scaleContractionSample<ADataType,
+                                  BDataType,
+                                  DDataType,
+                                  floatTypeCompute,
+                                  typeA,
+                                  typeB,
+                                  typeD,
+                                  typeCompute>(&alpha);
+}
diff --git a/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp
new file mode 100644
index 00000000..a3eb5e6f
--- /dev/null
+++ b/samples/01_contraction/simple_scale_contraction_cf32_cf32_cf32_compute_cf32.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hipFloatComplex ADataType; + typedef hipFloatComplex BDataType; + typedef hipFloatComplex DDataType; + typedef hipFloatComplex floatTypeCompute; + + constexpr hipDataType typeA = HIP_C_32F; + constexpr hipDataType typeB = HIP_C_32F; + constexpr hipDataType typeD = HIP_C_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_C32F; + + floatTypeCompute alpha(1, 1); + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp new file mode 100644 index 00000000..9283283b --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp new file mode 100644 index 00000000..dac5e18b --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + floatTypeCompute alpha = floatTypeCompute{1.0f}; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp new file mode 100644 index 00000000..155f9585 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp new file mode 100644 index 00000000..2def291d --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp new file mode 100644 index 00000000..7b2a9c95 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = 1; + return scaleContractionSample(&alpha); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp new file mode 100644 index 00000000..201741e9 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "simple_scale_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF64Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double DDataType;
+    typedef double floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeD       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F;
+
+    floatTypeCompute alpha = 1;
+    return scaleContractionSample<ADataType,
+                                  BDataType,
+                                  DDataType,
+                                  floatTypeCompute,
+                                  typeA,
+                                  typeB,
+                                  typeD,
+                                  typeCompute>(&alpha);
+}
diff --git a/samples/02_permutation/CMakeLists.txt b/samples/02_permutation/CMakeLists.txt
index 68857b54..ab66798c 100644
--- a/samples/02_permutation/CMakeLists.txt
+++ b/samples/02_permutation/CMakeLists.txt
@@ -26,7 +26,7 @@
 # Check whether building within hiptensor context
 if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" )
-    add_hiptensor_sample(permutation permutation.cpp)
+    add_hiptensor_sample(simple_permutation permutation.cpp)
 # If building hipTensor samples as a standalone Cmake project
 else()
     add_executable(permutation permutation.cpp)
diff --git a/test/00_unit/yaml_test.cpp b/test/00_unit/yaml_test.cpp
index 2efc6b6e..57a86a25 100644
--- a/test/00_unit/yaml_test.cpp
+++ b/test/00_unit/yaml_test.cpp
@@ -54,8 +54,8 @@ namespace hiptensor
     using LengthsT = std::vector<std::size_t>;
     using StridesT = std::vector<std::size_t>;
-    using AlphaT   = double;
-    using BetaT    = double;
+    using AlphaT   = std::vector<double>;
+    using BetaT    = std::vector<double>;

     //Data types of input and output tensors
     std::vector<std::vector<hipDataType>> mDataTypes;
@@ -79,9 +83,13 @@ int main(int argc, char* argv[])
     yee.mDataTypes = {
         // clang-format off
         {HIP_R_32F, HIP_R_32F, hiptensor::NONE_TYPE, HIP_R_32F, HIP_R_32F}, // scale F32
+        {HIP_C_32F, HIP_C_32F, hiptensor::NONE_TYPE, HIP_C_32F, HIP_C_32F}, // scale F32 Complex
         {HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F}, // bilinear F32
+        {HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F}, // bilinear F32 Complex
         {HIP_R_64F, HIP_R_64F, hiptensor::NONE_TYPE, HIP_R_64F, HIP_R_64F}, // scale F64
+        {HIP_C_64F, HIP_C_64F, hiptensor::NONE_TYPE, HIP_C_64F, HIP_C_64F}, // scale F64 Complex
         {HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F}, // bilinear F64
+        {HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F}, // bilinear F64 Complex
         // clang-format on
     };
     yee.mAlgorithms
@@ -94,8 +98,8 @@ int main(int argc, char* argv[])
     yee.mProblemLengths
         = {{5, 6, 7, 8, 4, 2, 3, 4}, {1, 2, 3, 4}, {99, 12, 44, 31, 59, 23, 54, 22}};
     yee.mProblemStrides = {{}};
-    yee.mAlphas = {0, 1, 1};
-    yee.mBetas  = {2, 2, 2};
+    yee.mAlphas = {{0}, {1}, {1}};
+    yee.mBetas  = {{2}, {2}, {2}};

     struct TmpFileWrapper
     {
diff --git a/test/01_contraction/CMakeLists.txt b/test/01_contraction/CMakeLists.txt
index fe2d7a87..a59eeefd 100644
--- a/test/01_contraction/CMakeLists.txt
+++ b/test/01_contraction/CMakeLists.txt
@@ -33,10 +33,20 @@ set (BilinearContractionTestSources ${ContractionCommonSources}
 set (BilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/bilinear_test_params.yaml)
 add_hiptensor_test(bilinear_contraction_test ${BilinearContractionTestConfig} ${BilinearContractionTestSources})

+# Complex Bilinear tests
+set (ComplexBilinearContractionTestSources ${ContractionCommonSources}
+     ${CMAKE_CURRENT_SOURCE_DIR}/complex_bilinear_contraction_test.cpp)
+set 
(ComplexBilinearContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/complex_bilinear_test_params.yaml) +add_hiptensor_test(complex_bilinear_contraction_test ${ComplexBilinearContractionTestConfig} ${ComplexBilinearContractionTestSources}) + # Scale tests set (ScaleContractionTestSources ${ContractionCommonSources} ${CMAKE_CURRENT_SOURCE_DIR}/scale_contraction_test.cpp) set (ScaleContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/scale_test_params.yaml) add_hiptensor_test(scale_contraction_test ${ScaleContractionTestConfig} ${ScaleContractionTestSources}) - +# Complex Scale tests +set (ComplexScaleContractionTestSources ${ContractionCommonSources} + ${CMAKE_CURRENT_SOURCE_DIR}/complex_scale_contraction_test.cpp) +set (ComplexScaleContractionTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/complex_scale_test_params.yaml) +add_hiptensor_test(complex_scale_contraction_test ${ComplexScaleContractionTestConfig} ${ComplexScaleContractionTestSources}) diff --git a/test/01_contraction/complex_bilinear_contraction_test.cpp b/test/01_contraction/complex_bilinear_contraction_test.cpp new file mode 100644 index 00000000..51e95c34 --- /dev/null +++ b/test/01_contraction/complex_bilinear_contraction_test.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexBilinearContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexBilinearContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexBilinearContractionTest, load_config_helper()); diff --git a/test/01_contraction/complex_scale_contraction_test.cpp b/test/01_contraction/complex_scale_contraction_test.cpp new file mode 100644 index 00000000..3995651b --- /dev/null +++ b/test/01_contraction/complex_scale_contraction_test.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include "contraction_test.hpp" +#include "contraction_test_helpers.hpp" + +class ComplexScaleContractionTest : public hiptensor::ContractionTest +{ +}; + +TEST_P(ComplexScaleContractionTest, RunKernel) +{ + static bool ranWarmup = false; + if(!ranWarmup) + { + this->Warmup(); + ranWarmup = true; + } + this->RunKernel(); +} + +INSTANTIATE_TEST_SUITE_P(ContractionTests, ComplexScaleContractionTest, load_config_helper()); diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..9306445a 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,8 +1,13 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -14,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/configs/complex_bilinear_test_params.yaml b/test/01_contraction/configs/complex_bilinear_test_params.yaml new file mode 100644 index 00000000..dfbb814e --- /dev/null +++ b/test/01_contraction/configs/complex_bilinear_test_params.yaml @@ -0,0 +1,30 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F, HIP_C_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY 
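+# Note: complex Alphas/Betas entries below are [real, imaginary] pairs; the
+# test harness packs each pair into a ScalarData of the requested compute type.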
+Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - [0, 0] + - [1, 1] + - [1.1, 1.2] +Betas: + - [2, 2] + - [0, 0] + - [2.2, 2.3] +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 1 ] +Strides: + - [] +... diff --git a/test/01_contraction/configs/complex_scale_test_params.yaml b/test/01_contraction/configs/complex_scale_test_params.yaml new file mode 100644 index 00000000..4bad2a9b --- /dev/null +++ b/test/01_contraction/configs/complex_scale_test_params.yaml @@ -0,0 +1,30 @@ +--- +Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] +Tensor Data Types: + - [ HIP_C_32F, HIP_C_32F, NONE_TYPE, HIP_C_32F, HIP_C_32F ] + - [ HIP_C_64F, HIP_C_64F, NONE_TYPE, HIP_C_64F, HIP_C_64F ] +Algorithm Types: + - HIPTENSOR_ALGO_DEFAULT + - HIPTENSOR_ALGO_DEFAULT_PATIENT + - HIPTENSOR_ALGO_ACTOR_CRITIC +Operators: + - HIPTENSOR_OP_IDENTITY +Worksize Prefs: + - HIPTENSOR_WORKSPACE_RECOMMENDED + - HIPTENSOR_WORKSPACE_MIN + - HIPTENSOR_WORKSPACE_MAX +Alphas: + - [0, 0] + - [1, 1] + - [1.1, 1.2] +Betas: + - [2, 2] + - [0, 0] + - [2.2, 2.3] +Lengths: + - [ 5, 6, 3, 4, 3, 4 ] + - [ 4, 3, 4, 3, 6, 5 ] + - [ 24, 18, 2, 4, 9, 1 ] +Strides: + - [] +... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..4c52eeda 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,8 +1,13 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT @@ -14,13 +19,13 @@ Worksize Prefs: - HIPTENSOR_WORKSPACE_MIN - HIPTENSOR_WORKSPACE_MAX Alphas: - - 0 - - 1 - - 1 + - [0] + - [1] + - [1] Betas: - - 2 - - 0 - - 2 + - [2] + - [0] + - [2] Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..664da2ec 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,8 +56,10 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) - || (isF64Supported() && datatype == HIP_R_64F); + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF + || datatype == HIP_C_32F)) + || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F)); } bool ContractionTest::checkSizes() const @@ -115,11 +117,23 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F) + || (ADataType == 
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 5d745d12..664da2ec 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -56,8 +56,10 @@ namespace hiptensor
     // False = skip test
     bool ContractionTest::checkDevice(hipDataType datatype) const
     {
-        return (isF32Supported() && datatype == HIP_R_32F)
-               || (isF64Supported() && datatype == HIP_R_64F);
+        return (isF32Supported()
+                && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF
+                    || datatype == HIP_C_32F))
+               || (isF64Supported() && (datatype == HIP_R_64F || datatype == HIP_C_64F));
     }
 
     bool ContractionTest::checkSizes() const
@@ -115,11 +117,23 @@
         auto CDataType = testType[2];
         auto DDataType = testType[3];
 
-        EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F));
-        EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F));
-        EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F)
+        EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF)
+                    || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)
+                    || (ADataType == HIP_C_32F) || (ADataType == HIP_C_64F));
+        EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF)
+                    || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)
+                    || (BDataType == HIP_C_32F) || (BDataType == HIP_C_64F));
+        EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF)
+                    || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F)
+                    || (CDataType == HIP_C_32F) || (CDataType == HIP_C_64F)
                     || (CDataType == NONE_TYPE));
-        EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F));
+        EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF)
+                    || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)
+                    || (DDataType == HIP_C_32F) || (DDataType == HIP_C_64F));
+        EXPECT_TRUE(
+            (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF)
+            || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)
+            || (computeType == HIPTENSOR_COMPUTE_C32F) || (computeType == HIPTENSOR_COMPUTE_C64F));
 
         mRunFlag &= checkDevice(DDataType);
 
@@ -228,7 +242,35 @@
             auto resource = getResource();
             resource->resizeStorage(lengths, elementBytes);
 
-            if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F)
+            if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA);
+                fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB);
+                if(CDataType == HIP_R_16F)
+                {
+                    fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD);
+                }
+                fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(),
+                                              elementsCD,
+                                              std::numeric_limits<_Float16>::signaling_NaN());
+            }
+            else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceA().get(), elementsA);
+                fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceB().get(), elementsB);
+                if(CDataType == HIP_R_16BF)
+                {
+                    fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceC().get(),
+                                                   elementsCD);
+                }
+                fillValLaunchKernel<hip_bfloat16>(
+                    (hip_bfloat16*)resource->deviceD().get(),
+                    elementsCD,
+                    std::numeric_limits<hip_bfloat16>::signaling_NaN());
+            }
+            else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F)
             {
                 // Initialize matrix data on device
                 fillLaunchKernel<float>((float*)resource->deviceA().get(), elementsA);
@@ -254,6 +296,40 @@ namespace hiptensor
                                     elementsCD,
                                     std::numeric_limits<double>::signaling_NaN());
             }
+            else if(ADataType == HIP_C_32F && BDataType == HIP_C_32F && DDataType == HIP_C_32F)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceA().get(),
+                                                  elementsA);
+                fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceB().get(),
+                                                  elementsB);
+                if(CDataType == HIP_C_32F)
+                {
+                    fillLaunchKernel<hipFloatComplex>((hipFloatComplex*)resource->deviceC().get(),
+                                                      elementsCD);
+                }
+                fillValLaunchKernel<hipFloatComplex>(
+                    (hipFloatComplex*)resource->deviceD().get(),
+                    elementsCD,
+                    std::numeric_limits<hipFloatComplex>::signaling_NaN());
+            }
+            else if(ADataType == HIP_C_64F && BDataType == HIP_C_64F && DDataType == HIP_C_64F)
+            {
+                // Initialize matrix data on device
+                fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceA().get(),
+                                                   elementsA);
+                fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceB().get(),
+                                                   elementsB);
+                if(CDataType == HIP_C_64F)
+                {
+                    fillLaunchKernel<hipDoubleComplex>((hipDoubleComplex*)resource->deviceC().get(),
+                                                       elementsCD);
+                }
+                fillValLaunchKernel<hipDoubleComplex>(
+                    (hipDoubleComplex*)resource->deviceD().get(),
+                    elementsCD,
+                    std::numeric_limits<hipDoubleComplex>::signaling_NaN());
+            }
 
             resource->copyDeviceToHostAll(elementBytes);
 
@@ -328,7 +404,7 @@ namespace hiptensor
         {
             auto resource = getResource();
 
-            int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double));
+            int size = hipDataTypeSize(DDataType);
 
             size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(),
                                                a_ms_ks.mLengths.end(),
@@ -346,7 +422,50 @@
             auto D = resource->allocHost(elementsCD * size);
             resource->copyData(D, resource->deviceD(), elementsCD * size);
 
-            if(DDataType == HIP_R_32F)
+            if(DDataType == HIP_R_16F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<_Float16>(
+                    stream, (_Float16*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<_Float16>(
+                    stream, (_Float16*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<_Float16>(
+                    stream, (_Float16*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_R_16BF)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<hip_bfloat16>(
+                    stream, (hip_bfloat16*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_R_32F)
             {
                 stream << "Tensor A elements:\n";
                 hiptensorPrintArrayElements<float>(
@@ -367,7 +486,7 @@
                 hiptensorPrintArrayElements<float>(stream, (float*)D.get(), elementsCD);
                 stream << std::endl;
             }
-            else
+            else if(DDataType == HIP_R_64F)
             {
                 stream << "Tensor A elements:\n";
                 hiptensorPrintArrayElements<double>(
@@ -388,6 +507,50 @@
                 hiptensorPrintArrayElements<double>(stream, (double*)D.get(), elementsCD);
                 stream << std::endl;
             }
+            else if(DDataType == HIP_C_32F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<hipFloatComplex>(
+                    stream, (hipFloatComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
+            else if(DDataType == HIP_C_64F)
+            {
+                stream << "Tensor A elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)resource->hostA().get(), elementsA);
+                stream << std::endl;
+
+                stream << "Tensor B elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)resource->hostB().get(), elementsB);
+                stream << std::endl;
+
+                stream << "Tensor C elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)resource->hostC().get(), elementsCD);
+                stream << std::endl;
+
+                stream << "Tensor D elements:\n";
+                hiptensorPrintArrayElements<hipDoubleComplex>(
+                    stream, (hipDoubleComplex*)D.get(), elementsCD);
+                stream << std::endl;
+            }
         }
     }
 }
@@ -414,6 +577,19 @@ namespace hiptensor
 
             auto computeType = convertToComputeType(testType[4]);
 
+            /*
+             * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha:
+             * ```
+             * alphaF = hiptensor::readVal<float>(
+             *     alpha, convertToComputeType(HipDataType_v<ComputeDataT>));
+             * ```
+             * Hence, `alpha` and `beta` need to point to a value of the compute type.
+             */
+            ScalarData alphaBuf;
+            ScalarData betaBuf;
+            writeVal(&alphaBuf, computeType, ScalarData(computeType, alpha[0], alpha[1]));
+            writeVal(&betaBuf, computeType, ScalarData(computeType, beta[0], beta[1]));
+
             CHECK_HIPTENSOR_ERROR(
                 hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
 
@@ -421,20 +597,21 @@ namespace hiptensor
 
             CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
                                                        &plan,
-                                                       (void*)&alpha,
+                                                       (void*)&alphaBuf,
                                                        resource->deviceA().get(),
                                                        resource->deviceB().get(),
-                                                       (void*)&beta,
+                                                       (void*)&betaBuf,
                                                        resource->deviceC().get(),
                                                        resource->deviceD().get(),
                                                        workspace,
                                                        worksize,
                                                        0 /* stream */));
 
-            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha,
+            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan,
+                                                                (void*)&alphaBuf,
                                                                 resource->hostA().get(),
                                                                 resource->hostB().get(),
-                                                                (void*)&beta,
+                                                                (void*)&betaBuf,
                                                                 resource->hostC().get(),
                                                                 resource->hostD().get(),
                                                                 a_ms_ks.mLengths,
@@ -451,24 +628,47 @@ namespace hiptensor
                                                                 DDataType,
                                                                 workspace));
 
-            size_t elementsCD = std::accumulate(c_ms_ns.mLengths.begin(),
-                                                c_ms_ns.mLengths.end(),
+            size_t elementsCD = std::accumulate(d_ms_ns.mLengths.begin(),
+                                                d_ms_ns.mLengths.end(),
                                                 size_t{1},
                                                 std::multiplies<size_t>());
 
-            int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double));
+            int sizeD      = elementsCD * hipDataTypeSize(DDataType);
             auto reference = resource->allocDevice(sizeD);
             resource->copyData(reference, resource->hostD(), sizeD);
 
-            if(DDataType == HIP_R_32F)
+            if(DDataType == HIP_R_16F)
+            {
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(),
+                                                         (_Float16*)reference.get(),
+                                                         elementsCD,
+                                                         computeType);
+            }
+            else if(DDataType == HIP_R_16BF)
+            {
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<hip_bfloat16>(
+                        (hip_bfloat16*)resource->deviceD().get(),
+                        (hip_bfloat16*)reference.get(),
+                        elementsCD,
+                        computeType);
+            }
+            else if(DDataType == HIP_R_32F || DDataType == HIP_C_32F)
             {
-                std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<float>(
-                    (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD);
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<float>((float*)resource->deviceD().get(),
+                                                      (float*)reference.get(),
+                                                      elementsCD,
+                                                      computeType);
             }
-            else if(DDataType == HIP_R_64F)
+            else if(DDataType == HIP_R_64F || DDataType == HIP_C_64F)
             {
-                std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<double>(
-                    (double*)resource->deviceD().get(), (double*)reference.get(), elementsCD);
+                std::tie(mValidationResult, mMaxRelativeError)
+                    = compareEqualLaunchKernel<double>((double*)resource->deviceD().get(),
+                                                       (double*)reference.get(),
+                                                       elementsCD,
+                                                       computeType);
             }
 
             EXPECT_TRUE(mValidationResult) << "Max relative error: " << mMaxRelativeError;
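The `ScalarData` staging above exists because `hiptensorContraction` receives `alpha` and `beta` as `void*` and reinterprets the pointed-to bytes according to the plan's compute type. A standalone sketch of that contract, with a hypothetical `loadScalar` standing in for the `hiptensor::readVal` call quoted in the comment:

```cpp
#include <hip/hip_complex.h>

#include <cstring>
#include <iostream>

// Hypothetical readVal-style loader: the void* scalar must point to a value
// of the compute type, or the reinterpretation below reads the wrong bytes.
template <typename ComputeT>
ComputeT loadScalar(void const* scalar)
{
    ComputeT value;
    std::memcpy(&value, scalar, sizeof(ComputeT));
    return value;
}

int main()
{
    hipFloatComplex alpha = make_hipFloatComplex(1.1f, 1.2f);
    auto            read  = loadScalar<hipFloatComplex>(&alpha);
    std::cout << read.x << " + " << read.y << "i\n"; // 1.1 + 1.2i

    // Passing &someDouble for a C32F compute type would mis-read the bytes,
    // which is why the tests pack values into a compute-type buffer first.
    return 0;
}
```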
diff --git a/test/01_contraction/contraction_test_params.hpp b/test/01_contraction/contraction_test_params.hpp
index 29c4aa1b..4db4ebc1 100644
--- a/test/01_contraction/contraction_test_params.hpp
+++ b/test/01_contraction/contraction_test_params.hpp
@@ -49,8 +49,8 @@ namespace hiptensor
         using LengthsT = std::vector<std::size_t>;
         using StridesT = std::vector<std::size_t>;
-        using AlphaT   = double;
-        using BetaT    = double;
+        using AlphaT   = std::vector<double>;
+        using BetaT    = std::vector<double>;
 
     public:
         std::vector<TestDataTypeT>& dataTypes()
diff --git a/test/02_permutation/CMakeLists.txt b/test/02_permutation/CMakeLists.txt
index 4334901c..bb2796ea 100644
--- a/test/02_permutation/CMakeLists.txt
+++ b/test/02_permutation/CMakeLists.txt
@@ -29,7 +29,10 @@ set(PermutationCommonSources ${HIPTENSOR_COMMON_TEST_SOURCES}
 # tests
 set (PermutationTestSources ${PermutationCommonSources}
-     ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp)
+     ${CMAKE_CURRENT_SOURCE_DIR}/permutation_column_major_test.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/permutation_cpu_impl_test.cpp
+     )
+
 set (PermutationTestConfig ${CMAKE_CURRENT_SOURCE_DIR}/configs/test_params.yaml)
 add_hiptensor_test(permutation_test ${PermutationTestConfig} ${PermutationTestSources})
diff --git a/test/02_permutation/permutation_cpu_impl_test.cpp b/test/02_permutation/permutation_cpu_impl_test.cpp
new file mode 100644
index 00000000..5a885f0b
--- /dev/null
+++ b/test/02_permutation/permutation_cpu_impl_test.cpp
@@ -0,0 +1,163 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <numeric>
+
+#include "data_types.hpp"
+#include "logger.hpp"
+#include "permutation/permutation_cpu_reference.hpp"
+#include "permutation_test.hpp"
+#include "utils.hpp"
+#include "llvm/hiptensor_options.hpp"
+
+template <typename floatTypeA, typename floatTypeB, typename floatTypeCompute>
+auto permuteWithCpu(hipDataType typeA, hipDataType typeB, hipDataType typeCompute)
+{
+    std::vector<int32_t> modeA{'w', 'h', 'c', 'n'};
+    std::vector<int32_t> modeB{'c', 'n', 'h', 'w'};
+    int                  nmodeA = modeA.size();
+    int                  nmodeB = modeB.size();
+
+    std::unordered_map<int32_t, int64_t> extent;
+    extent['h'] = 2;
+    extent['w'] = 3;
+    extent['c'] = 4;
+    extent['n'] = 5;
+
+    std::vector<int64_t> extentA;
+    for(auto mode : modeA)
+    {
+        extentA.push_back(extent[mode]);
+    }
+    std::vector<int64_t> extentB;
+    for(auto mode : modeB)
+    {
+        extentB.push_back(extent[mode]);
+    }
+
+    /**********************
+     * Allocating data
+     **********************/
+
+    size_t elementsA = 1;
+    for(auto mode : modeA)
+    {
+        elementsA *= extent[mode];
+    }
+    size_t elementsB = 1;
+    for(auto mode : modeB)
+    {
+        elementsB *= extent[mode];
+    }
+
+    size_t sizeA = sizeof(floatTypeA) * elementsA;
+    size_t sizeB = sizeof(floatTypeB) * elementsB;
+
+    std::vector<floatTypeA> aArray(elementsA);
+    std::vector<floatTypeB> bArray(elementsB);
+    std::iota(aArray.begin(), aArray.end(), 0);
+
+#if HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+    std::vector<floatTypeB> referenceArray
+        = {0.,    12.6,  25.2,  37.8,  50.4,  63.,   75.6,  88.2,  100.8, 113.4, 126.,  138.6,
+           151.2, 163.8, 176.4, 189.,  201.6, 214.2, 226.8, 239.4, 6.3,   18.9,  31.5,  44.1,
+           56.7,  69.3,  81.9,  94.5,  107.1, 119.7, 132.3, 144.9, 157.5, 170.1, 182.7, 195.3,
+           207.9, 220.5, 233.1, 245.7, 2.1,   14.7,  27.3,  39.9,  52.5,  65.1,  77.7,  90.3,
+           102.9, 115.5, 128.1, 140.7, 153.3, 165.9, 178.5, 191.1, 203.7, 216.3, 228.9, 241.5,
+           8.4,   21.,   33.6,  46.2,  58.8,  71.4,  84.,   96.6,  109.2, 121.8, 134.4, 147.,
+           159.6, 172.2, 184.8, 197.4, 210.,  222.6, 235.2, 247.8, 4.2,   16.8,  29.4,  42.,
+           54.6,  67.2,  79.8,  92.4,  105.,  117.6, 130.2, 142.8, 155.4, 168.,  180.6, 193.2,
+           205.8, 218.4, 231.,  243.6, 10.5,  23.1,  35.7,  48.3,  60.9,  73.5,  86.1,  98.7,
+           111.3, 123.9, 136.5, 149.1, 161.7, 174.3, 186.9, 199.5, 212.1, 224.7, 237.3, 249.9};
+#else // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+    std::vector<floatTypeB> referenceArray
+        = {0.,    84.,   168.,  42.,   126.,  210.,  2.1,   86.1,  170.1, 44.1,  128.1, 212.1,
+           4.2,   88.2,  172.2, 46.2,  130.2, 214.2, 6.3,   90.3,  174.3, 48.3,  132.3, 216.3,
+           8.4,   92.4,  176.4, 50.4,  134.4, 218.4, 10.5,  94.5,  178.5, 52.5,  136.5, 220.5,
+           12.6,  96.6,  180.6, 54.6,  138.6, 222.6, 14.7,  98.7,  182.7, 56.7,  140.7, 224.7,
+           16.8,  100.8, 184.8, 58.8,  142.8, 226.8, 18.9,  102.9, 186.9, 60.9,  144.9, 228.9,
+           21.,   105.,  189.,  63.,   147.,  231.,  23.1,  107.1, 191.1, 65.1,  149.1, 233.1,
+           25.2,  109.2, 193.2, 67.2,  151.2, 235.2, 27.3,  111.3, 195.3, 69.3,  153.3, 237.3,
+           29.4,  113.4, 197.4, 71.4,  155.4, 239.4, 31.5,  115.5, 199.5, 73.5,  157.5, 241.5,
+           33.6,  117.6, 201.6, 75.6,  159.6, 243.6, 35.7,  119.7, 203.7, 77.7,  161.7, 245.7,
+           37.8,  121.8, 205.8, 79.8,  163.8, 247.8, 39.9,  123.9, 207.9, 81.9,  165.9, 249.9};
+
+#endif // HIPTENSOR_DATA_LAYOUT_COL_MAJOR
+
+    const floatTypeCompute alphaValue = 2.1f;
+    hiptensorHandle_t*     handle;
+    CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+    hiptensorTensorDescriptor_t descA;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
+        handle, &descA, nmodeA, extentA.data(), NULL /* stride */, typeA, HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t descB;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(
+        handle, &descB, nmodeB, extentB.data(), NULL /* stride */, typeB, HIPTENSOR_OP_IDENTITY));
+
+    hiptensor::detail::permuteByCpu(&alphaValue,
+                                    aArray.data(),
+                                    &descA,
+                                    modeA.data(),
+                                    bArray.data(),
+                                    &descB,
+                                    modeB.data(),
+                                    typeCompute);
+    return compareEqual(referenceArray.data(),
+                        bArray.data(),
+                        bArray.size(),
+                        hiptensor::convertToComputeType(typeCompute),
+                        10);
+}
+
+TEST(PermutationCpuImplTest, CompareF32ResultWithReference)
+{
+    typedef float floatTypeA;
+    typedef float floatTypeB;
+    typedef float floatTypeCompute;
+
+    hipDataType typeA       = HIP_R_32F;
+    hipDataType typeB       = HIP_R_32F;
+    hipDataType typeCompute = HIP_R_32F;
+
+    auto [result, maxRelativeError]
+        = permuteWithCpu<floatTypeA, floatTypeB, floatTypeCompute>(typeA, typeB, typeCompute);
+    EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError;
+}
+
+TEST(PermutationCpuImplTest, CompareF16ResultWithReference)
+{
+    typedef _Float16 floatTypeA;
+    typedef _Float16 floatTypeB;
+    typedef _Float16 floatTypeCompute;
+
+    hipDataType typeA       = HIP_R_16F;
+    hipDataType typeB       = HIP_R_16F;
+    hipDataType typeCompute = HIP_R_16F;
+
+    auto [result, maxRelativeError]
+        = permuteWithCpu<floatTypeA, floatTypeB, floatTypeCompute>(typeA, typeB, typeCompute);
+    EXPECT_TRUE(result) << "max_relative_error: " << maxRelativeError;
+}
diff --git a/test/02_permutation/permutation_resource.cpp b/test/02_permutation/permutation_resource.cpp
index 1f448ff8..6acd7577 100644
--- a/test/02_permutation/permutation_resource.cpp
+++ b/test/02_permutation/permutation_resource.cpp
@@ -72,7 +72,7 @@ namespace hiptensor
             mCurrentAllocByte = requiredMemorySize;
             needFillData      = true;
         }
-        else if(mCurrentDataType != dataType)
+        if(mCurrentDataType != dataType || mCurrentMatrixElement < requiredElementCount)
         {
             needFillData = true;
         }
diff --git a/test/02_permutation/permutation_test.cpp b/test/02_permutation/permutation_test.cpp
index cfadf5c0..078c78a4 100644
--- a/test/02_permutation/permutation_test.cpp
+++ b/test/02_permutation/permutation_test.cpp
@@ -257,7 +257,8 @@ namespace hiptensor
                 std::tie(mValidationResult, mMaxRelativeError)
                     = compareEqualLaunchKernel<float>((float*)resource->deviceB().get(),
                                                       (float*)resource->deviceReference().get(),
-                                                      resource->getCurrentMatrixElement());
+                                                      resource->getCurrentMatrixElement(),
+                                                      convertToComputeType(computeDataType));
             }
             else if(abDataType == HIP_R_16F)
             {
@@ -273,7 +274,8 @@ namespace hiptensor
                 std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>(
                     (_Float16*)resource->deviceB().get(),
                     (_Float16*)resource->deviceReference().get(),
-                    resource->getCurrentMatrixElement());
+                    resource->getCurrentMatrixElement(),
+                    convertToComputeType(computeDataType));
             }
         }
diff --git a/test/device/common.hpp b/test/device/common.hpp
index f961abc1..283a9035 100644
--- a/test/device/common.hpp
+++ b/test/device/common.hpp
@@ -72,8 +72,21 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed)
 
     if(index < elementSize)
     {
-        auto value  = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize;
-        data[index] = static_cast<DataType>(value);
+        if constexpr(std::is_same_v<DataType, hipFloatComplex>)
+        {
+            auto value  = (float(index / float(RAND_MAX) - 0.5) * 100) / elementSize;
+            data[index] = make_hipFloatComplex(value, value);
+        }
+        else if constexpr(std::is_same_v<DataType, hipDoubleComplex>)
+        {
+            auto value  = (double(index / double(RAND_MAX) - 0.5) * 100) / elementSize;
+            data[index] = make_hipDoubleComplex(value, value);
+        }
+        else
+        {
+            auto value  = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize;
+            data[index] = static_cast<DataType>(value);
+        }
     }
 }
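The `if constexpr` branches in `fillKernel` are needed because `hipFloatComplex` and `hipDoubleComplex` do not convert implicitly from arithmetic values, so the generic `static_cast` path cannot be instantiated for them. A host-side sketch of the same compile-time dispatch pattern (generic names, not hipTensor code):

```cpp
#include <hip/hip_complex.h>

#include <type_traits>

// Compile-time dispatch: only the branch matching DataType is instantiated,
// so the real-typed static_cast never has to compile for complex types.
template <typename DataType>
DataType makeFillValue(double value)
{
    if constexpr(std::is_same_v<DataType, hipFloatComplex>)
    {
        return make_hipFloatComplex(static_cast<float>(value), static_cast<float>(value));
    }
    else if constexpr(std::is_same_v<DataType, hipDoubleComplex>)
    {
        return make_hipDoubleComplex(value, value);
    }
    else
    {
        return static_cast<DataType>(value);
    }
}

int main()
{
    auto c = makeFillValue<hipFloatComplex>(0.5); // (0.5, 0.5)
    auto f = makeFillValue<float>(0.5);           // 0.5f
    return (c.x == f) ? 0 : 1;
}
```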
diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp
index 46f4c43e..8b504b01 100644
--- a/test/llvm/yaml_parser_config.cpp
+++ b/test/llvm/yaml_parser_config.cpp
@@ -92,6 +92,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorOperator_t)
 LLVM_YAML_IS_SEQUENCE_VECTOR(hiptensorWorksizePreference_t)
 LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<std::size_t>)
 LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<hipDataType>)
+LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<double>)
 LLVM_YAML_IS_SEQUENCE_VECTOR(AlphaT)
 LLVM_YAML_IS_SEQUENCE_VECTOR(BetaT)
 
@@ -110,8 +111,11 @@ namespace llvm
         static void enumeration(IO& io, hipDataType& value)
         {
             io.enumCase(value, "HIP_R_16F", HIP_R_16F);
+            io.enumCase(value, "HIP_R_16BF", HIP_R_16BF);
             io.enumCase(value, "HIP_R_32F", HIP_R_32F);
             io.enumCase(value, "HIP_R_64F", HIP_R_64F);
+            io.enumCase(value, "HIP_C_32F", HIP_C_32F);
+            io.enumCase(value, "HIP_C_64F", HIP_C_64F);
             io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE);
         }
     };
@@ -226,10 +230,10 @@ namespace llvm
             io.mapRequired("Algorithm Types", doc.algorithms());
             io.mapRequired("Operators", doc.operators());
             io.mapRequired("Worksize Prefs", doc.workSizePrefrences());
-            io.mapRequired("Alphas", (std::vector<AlphaT>&)(doc.alphas()));
+            io.mapOptional("Alphas", (std::vector<std::vector<double>>&)(doc.alphas()));
             io.mapOptional("Betas",
-                           (std::vector<BetaT>&)(doc.betas()),
-                           std::vector<BetaT>(doc.alphas().size(), BetaT(0)));
+                           (std::vector<std::vector<double>>&)(doc.betas()),
+                           std::vector<std::vector<double>>(doc.alphas().size()));
             io.mapRequired("Lengths", doc.problemLengths());
 
             // Default values for optional values
@@ -256,6 +260,13 @@ namespace llvm
                 return "Error: Empty Alphas";
             }
 
+            if(std::any_of(doc.alphas().cbegin(), doc.alphas().cend(), [](auto&& alpha) {
+                   return alpha.size() > 2 || alpha.size() <= 0;
+               }))
+            {
+                return "Error: invalid Alpha";
+            }
+
             if(doc.betas().size() > 0 && doc.betas().size() != doc.alphas().size())
             {
                 return "Error: Alphas and betas must have same size";
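The `std::any_of` guard added above enforces that every scalar entry carries one component (real) or two (real plus imaginary). The same check, extracted so it can be exercised outside the LLVM YAML machinery; note also that the `mapOptional` default for Betas builds one empty inner vector per alpha entry:

```cpp
#include <algorithm>
#include <vector>

int main()
{
    std::vector<std::vector<double>> alphas = {{1}, {1.1, 1.2}, {}};

    // Same shape as the validate() rule above: every entry must hold
    // one component (real) or two components (real, imaginary).
    bool invalid = std::any_of(alphas.cbegin(), alphas.cend(), [](auto const& alpha) {
        return alpha.size() > 2 || alpha.size() <= 0;
    });

    return invalid ? 1 : 0; // the empty third entry trips the check
}
```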
diff --git a/test/utils.hpp b/test/utils.hpp
index 1f7ece44..fc999738 100644
--- a/test/utils.hpp
+++ b/test/utils.hpp
@@ -41,9 +41,9 @@
 #include
 #include
 #include
+#include
 
 #include "device/common.hpp"
-#include "types.hpp"
 
 #define HIPTENSOR_FREE_DEVICE(ptr)     \
     if(ptr != nullptr)                 \
     {                                  \
@@ -57,6 +57,59 @@
         CHECK_HIP_ERROR(hipHostFree(ptr)); \
     }
 
+inline double getEpsilon(hiptensorComputeType_t id)
+{
+    auto toDouble = [](auto const& val) { return static_cast<double>(static_cast<float>(val)); };
+
+    if(id == HIPTENSOR_COMPUTE_16F)
+    {
+        return toDouble(std::numeric_limits<_Float16>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_16BF)
+    {
+        return toDouble(std::numeric_limits<hip_bfloat16>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_32F)
+    {
+        return toDouble(std::numeric_limits<float>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_64F)
+    {
+        return toDouble(std::numeric_limits<double>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_8U)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_8I)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_32U)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_32I)
+    {
+        return 0;
+    }
+    else if(id == HIPTENSOR_COMPUTE_C32F)
+    {
+        return toDouble(std::numeric_limits<float>::epsilon());
+    }
+    else if(id == HIPTENSOR_COMPUTE_C64F)
+    {
+        return toDouble(std::numeric_limits<double>::epsilon());
+    }
+    else
+    {
+#if !NDEBUG
+        std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl;
+#endif // !NDEBUG
+        return 0;
+    }
+}
+
 inline bool isF32Supported()
 {
     hipDevice_t mHandle;
@@ -137,10 +190,11 @@ __host__ static inline void
 }
 
 template <typename DDataType>
-std::pair<bool, double> compareEqual(DDataType const* deviceD,
-                                     DDataType const* hostD,
-                                     std::size_t      elementsD,
-                                     double           tolerance = 100.0)
+std::pair<bool, double> compareEqual(DDataType const*       deviceD,
+                                     DDataType const*       hostD,
+                                     std::size_t            elementsD,
+                                     hiptensorComputeType_t computeType,
+                                     double                 tolerance = 100.0)
 {
     bool   retval             = true;
     double max_relative_error = 0.0;
@@ -191,7 +245,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
         }
     }
 
-    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
+    auto eps = getEpsilon(computeType);
     if(isInf)
     {
         retval = false;
@@ -211,10 +265,11 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD,
 }
 
 template <typename DDataType>
-std::pair<bool, double> compareEqualLaunchKernel(DDataType*  deviceD,
-                                                 DDataType*  hostD,
-                                                 std::size_t elementsD,
-                                                 double      tolerance = 100.0)
+std::pair<bool, double> compareEqualLaunchKernel(DDataType*             deviceD,
+                                                 DDataType*             hostD,
+                                                 std::size_t            elementsD,
+                                                 hiptensorComputeType_t computeType,
+                                                 double                 tolerance = 100.0)
 {
     auto blockDim = dim3(1024, 1, 1);
     auto gridDim  = dim3(ceilDiv(elementsD, blockDim.x), 1, 1);
@@ -276,7 +331,7 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD,
     auto toDouble
         = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); };
 
-    auto eps = toDouble(std::numeric_limits<DDataType>::epsilon());
+    auto eps = getEpsilon(computeType);
     if(isNaN)
     {
         retval = false;
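One consequence of routing `getEpsilon(computeType)` into the comparators: the pass/fail bound now tracks the precision of the compute pipeline rather than the output storage type. Assuming the bound is `tolerance * eps`, matching the pre-existing `tolerance = 100.0` parameter (the comparison itself sits outside this hunk), the effect looks like this:

```cpp
#include <iostream>
#include <limits>

int main()
{
    // Assumption: results are accepted when maxRelativeError < tolerance * eps,
    // with eps taken from the compute type (see getEpsilon above).
    double epsF16    = 0.0009765625; // _Float16 epsilon, 2^-10
    double epsF32    = std::numeric_limits<float>::epsilon();
    double tolerance = 100.0;

    double maxRelativeError = 5e-4; // plausible error for f16 accumulation

    std::cout << (maxRelativeError < tolerance * epsF16) << '\n'; // 1: passes at f16 precision
    std::cout << (maxRelativeError < tolerance * epsF32) << '\n'; // 0: would fail at f32 precision
    return 0;
}
```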