From c5fbcec9afdfe109b1ba1a15d74d819beec2e6fe Mon Sep 17 00:00:00 2001
From: Cong Ma
Date: Wed, 22 Nov 2023 16:21:33 +0000
Subject: [PATCH 01/12] Add support for f16 and bf16 to contraction

- Support _Float16
- Support hip_bfloat16
- Add unit tests for _Float16 and hip_bfloat16
- Add samples for _Float16 and hip_bfloat16
---
 .../hiptensor/internal}/config.hpp            |   0
 .../hiptensor/internal/hiptensor_utility.hpp  |   1 +
 .../hiptensor/internal}/native_types.hpp      |   0
 .../hiptensor/internal}/native_types_impl.hpp |   0
 .../hiptensor/internal}/type_traits.hpp       |   0
 .../hiptensor/internal}/types.hpp             |   0
 .../hiptensor/internal}/types_ext.hpp         |   0
 .../hiptensor/internal}/xfloat32.hpp          |   0
 .../contraction_cpu_reference_impl.hpp        |  19 +-
 .../contraction_cpu_reference_instances.cpp   |  56 +++
 .../contraction/contraction_meta_traits.hpp   |  47 ++-
 .../src/contraction/contraction_selection.cpp | 280 +++++++++++++-
 .../src/contraction/contraction_solution.hpp  |   3 +-
 .../contraction/contraction_solution_impl.hpp |   6 +-
 .../contraction_solution_instances.cpp        |  57 +++
 library/src/contraction/device/CMakeLists.txt |  56 ++-
 ...16_bf16_bf16_compute_f32_kknn_instance.cpp |  62 ++++
 ...16_bf16_bf16_compute_f32_knnn_instance.cpp |  62 ++++
 ...16_bf16_bf16_compute_f32_mknn_instance.cpp |  62 ++++
 ...16_bf16_bf16_compute_f32_mnnn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_kknn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_knnn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_mknn_instance.cpp |  62 ++++
 ..._f16_f16_f16_compute_f32_mnnn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_kkn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_knn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_mkn_instance.cpp |  62 ++++
 ...f16_bf16_bf16_compute_f32_mnn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_kkn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_knn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_mkn_instance.cpp |  62 ++++
 ...e_f16_f16_f16_compute_f32_mnn_instance.cpp |  62 ++++
 ...hpp => hiptensor_contraction_bilinear.hpp} |   0
 ...le.hpp => hiptensor_contraction_scale.hpp} |   0
 .../src/contraction/hiptensor_contraction.cpp |  11 -
 library/src/hiptensor.cpp                     |   3 +-
 samples/01_contraction/CMakeLists.txt         |  15 +
 .../simple_bilinear_contraction_bf16.cpp      | 342 ++++++++++++++++++
 .../simple_bilinear_contraction_f16.cpp       | 342 ++++++++++++++++++
 .../simple_scale_contraction_bf16.cpp         | 334 +++++++++++++++++
 .../simple_scale_contraction_f16.cpp          | 334 +++++++++++++++++
 .../configs/bilinear_test_params.yaml         |   2 +
 .../configs/scale_test_params.yaml            |   2 +
 test/01_contraction/contraction_test.cpp      | 109 +++++-
 test/device/common.hpp                        |   2 +-
 test/llvm/yaml_parser_config.cpp              |   1 +
 test/utils.hpp                                |   2 +-
 47 files changed, 2944 insertions(+), 72 deletions(-)
 rename library/{src/include => include/hiptensor/internal}/config.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/native_types.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/native_types_impl.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/type_traits.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/types.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/types_ext.hpp (100%)
 rename library/{src/include => include/hiptensor/internal}/xfloat32.hpp (100%)
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
 create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp
 rename library/src/contraction/device/{contraction_bilinear.hpp => hiptensor_contraction_bilinear.hpp} (100%)
 rename library/src/contraction/device/{contraction_scale.hpp => hiptensor_contraction_scale.hpp} (100%)
 create mode 100644 samples/01_contraction/simple_bilinear_contraction_bf16.cpp
 create mode 100644 samples/01_contraction/simple_bilinear_contraction_f16.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_bf16.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_f16.cpp

diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp
similarity index 100%
rename from library/src/include/config.hpp
rename to library/include/hiptensor/internal/config.hpp
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index f2df2dd2..c386bbe0 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -31,6 +31,7 @@
 #include

 #include "../hiptensor_types.hpp"
+#include "types_ext.hpp"

 #ifndef CHECK_HIP_ERROR
 #define CHECK_HIP_ERROR(expression) \
diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp
similarity index 100%
rename from library/src/include/native_types.hpp
rename to library/include/hiptensor/internal/native_types.hpp
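[Editorial note, not part of the patch] The hunks that follow all serve one change: a new ComputeDataType template parameter, defaulting to ADataType, is threaded through ReferenceContraction_M2_N2_K2, MetaTraits, enumerateReferenceSolutions, and enumerateContractionSolutions, and new f16/bf16 instances are registered with ComputeDataType = float. On the hiptensor-facing side, MetaTraits swaps CK's ck::bhalf_t for the native hip_bfloat16 via std::conditional_t, and contraction_selection.cpp dispatches (typeA, typeB, typeD, typeE) tuples such as (HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F) to ActorCriticSelection specializations that currently pin a hard-coded kernel unique_id. The sketch below is a minimal, compilable model of the two type-level mechanisms only; MiniContraction, to_hip_type, bhalf_t, and hip_bf16 are hypothetical stand-ins, not hipTensor or CK API.

// Editor's sketch: models the ComputeDataType default and the bf16 type
// mapping introduced by this patch. Requires a compiler with _Float16
// support (e.g. hipcc/clang); all names here are illustrative stand-ins.
#include <cstdio>
#include <type_traits>

struct bhalf_t  { unsigned short x; }; // stand-in for ck::bhalf_t
struct hip_bf16 { unsigned short x; }; // stand-in for hip_bfloat16

// 1) ComputeDataType is a new trailing template parameter defaulting to
//    ADataType, so existing f32/f64 instantiations keep their old meaning
//    while f16/bf16 instances can accumulate in float.
template <typename ADataType,
          typename BDataType,
          typename EDataType,
          typename ComputeDataType = ADataType>
struct MiniContraction
{
    using Compute = ComputeDataType;
};

// 2) MetaTraits-style mapping: where CK spells bf16 as ck::bhalf_t, the
//    hiptensor-facing traits substitute the native HIP type.
template <typename T>
using to_hip_type = std::conditional_t<std::is_same_v<T, bhalf_t>, hip_bf16, T>;

int main()
{
    // f32 path: compute type defaults to the input type, as before the patch.
    static_assert(std::is_same_v<MiniContraction<float, float, float>::Compute, float>);

    // f16/bf16 paths: inputs stay narrow, accumulation is promoted to float,
    // mirroring the new ..._compute_f32_... instances registered below.
    static_assert(
        std::is_same_v<MiniContraction<_Float16, _Float16, _Float16, float>::Compute, float>);
    static_assert(
        std::is_same_v<MiniContraction<bhalf_t, bhalf_t, bhalf_t, float>::Compute, float>);

    // ck::bhalf_t is presented to hiptensor clients as the native bf16 type.
    static_assert(std::is_same_v<to_hip_type<bhalf_t>, hip_bf16>);
    static_assert(std::is_same_v<to_hip_type<float>, float>);

    std::puts("type plumbing checks passed");
}

Defaulting the new trailing parameter keeps every pre-existing instantiation source-compatible, which is why the f32 and f64 registration calls in the hunks below can remain unchanged while the f16/bf16 registrations pass an explicit trailing float.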
diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp similarity index 100% rename from library/src/include/native_types_impl.hpp rename to library/include/hiptensor/internal/native_types_impl.hpp diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp similarity index 100% rename from library/src/include/type_traits.hpp rename to library/include/hiptensor/internal/type_traits.hpp diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp similarity index 100% rename from library/src/include/types.hpp rename to library/include/hiptensor/internal/types.hpp diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp similarity index 100% rename from library/src/include/types_ext.hpp rename to library/include/hiptensor/internal/types_ext.hpp diff --git a/library/src/include/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp similarity index 100% rename from library/src/include/xfloat32.hpp rename to library/include/hiptensor/internal/xfloat32.hpp diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index 673f6dff..ac4fc20d 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -57,6 +57,7 @@ namespace hiptensor typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, + typename ComputeDataType = ADataType, ck::enable_if_t = false> @@ -70,7 +71,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation> + CDEElementwiseOperation, + ComputeDataType> { using BaseArgument = ck::tensor_operation::device::BaseArgument; using BaseInvoker = ck::tensor_operation::device::BaseInvoker; @@ -324,7 +326,8 @@ namespace hiptensor typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + CDEElementwiseOperation, + ComputeDataType>> : public MetaTraits< ck::tensor_operation::device::DeviceContractionMultipleD> + CDEElementwiseOperation, + ComputeDataType>> { }; @@ -359,7 +364,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> auto enumerateReferenceSolutions() { using ReferenceOp = ReferenceContraction_M2_N2_K2; + CDEElementwiseOperation, + ComputeDataType>; auto solution = std::make_unique>( std::make_unique()); diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp index 106dd5ff..146d2721 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -32,6 +32,34 @@ namespace hiptensor ContractionCpuReferenceInstances::ContractionCpuReferenceInstances() { // Register all the solutions exactly once + // Bilinear f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<_Float16>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + 
float>()); + + // Bilinear bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateReferenceSolutions<2, @@ -58,6 +86,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear>()); + // Scale f16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + _Float16, + _Float16, + ck::Tuple<>, + _Float16, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale bf16 + registerSolutions( + enumerateReferenceSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateReferenceSolutions<2, diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 4fa7acf7..ab158f96 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -49,7 +49,8 @@ namespace hiptensor typename DsDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Bilinear, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = DsDataType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Bilinear; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT + = std::conditional_t, hip_bfloat16, DsDataType>; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + using CDEOp = ck::tensor_operation::element_wise::Bilinear; }; // Partial specialize for Scale contraction @@ -82,7 +88,8 @@ namespace hiptensor typename BDataType, typename EDataType, typename AElementwiseOperation, - typename BElementwiseOperation> + typename BElementwiseOperation, + typename ComputeDataType> struct MetaTraits> + ck::tensor_operation::element_wise::Scale, + ComputeDataType>> { constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; - using ADataT = ADataType; - using BDataT = BDataType; - using DDataT = NoneType; - using EDataT = EDataType; - using AOp = AElementwiseOperation; - using BOp = BElementwiseOperation; - using CDEOp = ck::tensor_operation::element_wise::Scale; + using ADataT + = std::conditional_t, hip_bfloat16, ADataType>; + using BDataT + = std::conditional_t, hip_bfloat16, BDataType>; + using DDataT = NoneType; + using EDataT + = std::conditional_t, hip_bfloat16, EDataType>; + using AOp = AElementwiseOperation; + using BOp = BElementwiseOperation; + 
using CDEOp = ck::tensor_operation::element_wise::Scale; }; } // namespace hiptensor diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index aaa624f6..1b2cf92e 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -150,6 +150,192 @@ namespace hiptensor } } + // test + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 7255639152084218514; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR> + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 7255639152084218514; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // TODO select unique_id + unique_id = 
8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + + // select unique_id + unique_id = 8689089455041651212; + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + // end test + template <> struct ActorCriticSelection { @@ -1418,7 +1604,99 @@ namespace hiptensor std::vector const& e_ms_ns_strides, const uint64_t workspaceSize) { - if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::SCALE>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F + && typeE == HIP_R_16F) + { + return ActorCriticSelection<_Float16, + _Float16, + _Float16, + _Float16, + ContractionOpId_t::BILINEAR>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF + && typeE == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE + && typeE == HIP_R_32F) { return ActorCriticSelection:: selectWinner(winner, diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp index 0037584e..e76bb351 100644 --- 
a/library/src/contraction/contraction_solution.hpp +++ b/library/src/contraction/contraction_solution.hpp @@ -147,7 +147,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType> std::vector> enumerateContractionSolutions(); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 0fb5df9d..5e191441 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -274,7 +274,8 @@ namespace hiptensor typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, - typename CDEElementwiseOperation> + typename CDEElementwiseOperation, + typename ComputeDataType = ADataType> std::vector> enumerateContractionSolutions() { using ContractionOp @@ -287,7 +288,8 @@ namespace hiptensor EDataType, AElementwiseOperation, BElementwiseOperation, - CDEElementwiseOperation>; + CDEElementwiseOperation, + ComputeDataType>; using Factory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory; diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index fd263a8b..6d481577 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -32,6 +32,35 @@ namespace hiptensor ContractionSolutionInstances::ContractionSolutionInstances() { // Register all the solutions exactly once + + // Bilinear bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + + // Bilinear f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + float>()); + // Bilinear f32 registerSolutions( enumerateContractionSolutions<2, @@ -58,6 +87,34 @@ namespace hiptensor ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::Bilinear>()); + // Scale bf16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::bhalf_t, + ck::bhalf_t, + ck::Tuple<>, + ck::bhalf_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + + // Scale f16 + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + ck::half_t, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + float>()); + // Scale f32 registerSolutions( enumerateContractionSolutions<2, diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index f2e4a0fb..b9b382c0 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -24,24 +24,40 @@ # ############################################################################### -set(CK_CONTRACTION_INSTANCE_SOURCES - 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp -) + set(CK_CONTRACTION_INSTANCE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) -add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) -target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) + add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) + target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..7d777a83 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..a9a97148 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..d83d8d16 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..bc49c82b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..a9d963ab --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..c139942e --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..3c6ced30 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..33c66296 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..05400151 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..bba95b14 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..fb5ecec0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1dd6613c --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..e98aee20 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..db8de1c0 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..397ef327 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..1f9221dc --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
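[Aside, not part of the patch: every one of these translation units has the same shape — a using-alias naming one heavily templated instance type, plus a single add_* hook that appends concrete operations to a caller-owned vector, so the registry/selection layer links in exactly the layout-and-type combinations that were compiled. A generic, self-contained sketch of that idiom follows; DeviceOp, ScaleKKNInstance and addScaleKKNInstances are invented stand-ins for CK's DeviceContractionMultipleD base and add_device_operation_instances helper.]

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in for CK's polymorphic device-op base class.
struct DeviceOp
{
    virtual ~DeviceOp() = default;
    virtual std::string name() const = 0;
};

// Stand-in for one templated instance defined in a file like the ones above.
struct ScaleKKNInstance : DeviceOp
{
    std::string name() const override { return "scale_f16_kkn"; }
};

// One hook per translation unit: append this file's instances to the registry.
void addScaleKKNInstances(std::vector<std::unique_ptr<DeviceOp>>& instances)
{
    instances.push_back(std::make_unique<ScaleKKNInstance>());
}

int main()
{
    std::vector<std::unique_ptr<DeviceOp>> registry;
    addScaleKKNInstances(registry); // the selection layer later picks from this registry
    for(const auto& op : registry)
    {
        std::cout << op->name() << "\n";
    }
    return 0;
}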
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear.hpp similarity index 100% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear.hpp diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale.hpp similarity index 100% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale.hpp diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..b96a204e 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -708,17 +708,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..51af1f48 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..15972d60 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,15 +26,30 @@ # Check whether building within hiptensor context if( 
CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) + add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) + target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) + target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp new file mode 100644 index 00000000..0a4a9314 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
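[Reading aid, not part of the patch: with the contraction indices written out explicitly, the operation performed by the bf16 bilinear sample introduced above (and equally by its f16 and f32 siblings), using the sample's own mode ordering modeA = {m,n,h,k}, modeB = {u,v,h,k}, modeC = {m,n,u,v}, is:]

C_{m,n,u,v} \;\leftarrow\; \alpha \sum_{h=1}^{3} \sum_{k=1}^{4} A_{m,n,h,k}\, B_{u,v,h,k} \;+\; \beta\, C_{m,n,u,v}

[With the sample's extents m=5, n=6, u=3, v=4, h=3, k=4, the output has 5*6*3*4 = 360 elements, each summing 3*4 = 12 (h,k) pairs; alpha = 1.1 and beta = 1.0 blend the fresh contraction with the randomly initialized C.]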
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeC = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp new file mode 100644 index 00000000..d9d044c9 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -0,0 +1,342 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
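[A detail shared by the bf16 sample above and the f16 sample whose source begins here: alpha and beta are passed to hiptensorContraction as void* and are interpreted according to the descriptor's compute type (HIPTENSOR_COMPUTE_32F), not according to the 16-bit element type — patch 02 of this series notes a unit-test bug caused by getting exactly this wrong. A minimal sketch of the rule, with the surrounding hipTensor calls elided into comments; it is illustrative, not the library's documentation.]

#include <iostream>

int main()
{
    // Sketch only: scalar arguments must match the *compute* type.
    // With typeCompute == HIPTENSOR_COMPUTE_32F the scalars are 32-bit
    // floats, even though A/B/C hold _Float16 or hip_bfloat16 elements.
    float alpha = 1.1f;
    float beta  = 1.0f;
    // hiptensorContraction(handle, &plan, (void*)&alpha, A_d, B_d,
    //                      (void*)&beta, C_d, C_d, workspace, worksize,
    //                      0 /* stream */);
    // A _Float16 alpha handed through that void* would be read as the low
    // bytes of a float and silently scale the result by garbage.
    std::cout << "alpha=" << alpha << " beta=" << beta << "\n";
    return 0;
}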
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeC = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsC = std::accumulate( + c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeC = sizeof(CDataType) * elementsC; + + ADataType* A = nullptr; + BDataType* B = nullptr; + CDataType* C = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); + + void *A_d, *B_d, *C_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsC; i++) + { + C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "c_ms_ns: " << c_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + &c_ms_ns, + modeC.data(), + alignmentRequirementC, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + (void*)&beta, + C_d, + C_d, + workspace, + worksize, + 0 /* stream */)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor C elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorC; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorC.open("tensor_C_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(C); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(C_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp new file mode 100644 index 00000000..e05916bf --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
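[Reading aid for the two scale samples that follow (bf16 here, f16 after it), not part of the patch: they drop the C input entirely — the descriptor's C slot is passed as nullptr/nullptr/0 and hiptensorContraction receives nullptr for beta and for the C pointer. One caveat when reading them: the in-code banner comment writes B_{h,k,u,v} and calls the output C, but modeB is {'u','v','h','k'} and the output tensor in the code is D, so the operation actually performed is:]

D_{m,n,u,v} \;=\; \alpha \sum_{h=1}^{3} \sum_{k=1}^{4} A_{m,n,h,k}\, B_{u,v,h,k}

[With the shared extents, D again has 5*6*3*4 = 360 elements, each contracting 12 (h,k) pairs.]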
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16BF; + hipDataType typeB = HIP_R_16BF; + hipDataType typeD = HIP_R_16BF; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" << std::endl; + + return 0; +} diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp new file mode 100644 index 00000000..1e62be85 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -0,0 +1,334 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + hipDataType typeA = HIP_R_16F; + hipDataType typeB = HIP_R_16F; + hipDataType typeD = HIP_R_16F; + hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} + **********************/ + + std::vector modeD{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeD = modeD.size(); + + std::unordered_map extent; + + extent['m'] = 5; + extent['n'] = 6; + extent['u'] = 3; + extent['v'] = 4; + extent['h'] = 3; + extent['k'] = 4; + + std::vector d_ms_ns_lengths; + for(auto mode : modeD) + { + d_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t d_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &d_ms_ns, + nmodeD, + d_ms_ns_lengths.data(), + NULL, /*stride*/ + typeD, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
<< std::endl; + + size_t elementsA = std::accumulate( + a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsB = std::accumulate( + b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); + size_t elementsD = std::accumulate( + d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); + + size_t sizeA = sizeof(ADataType) * elementsA; + size_t sizeB = sizeof(BDataType) * elementsB; + size_t sizeD = sizeof(DDataType) * elementsD; + + ADataType* A = nullptr; + BDataType* B = nullptr; + DDataType* D = nullptr; + CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); + CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); + + void *A_d, *B_d, *D_d; + + CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); + CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); + + /******************* + * Initialize data + *******************/ + for(int64_t i = 0; i < elementsA; i++) + { + A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsB; i++) + { + B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + } + + for(int64_t i = 0; i < elementsD; i++) + { + D[i] = std::numeric_limits::signaling_NaN(); + } + + /******************************************** + * Transfer the Host Tensor to Device Memory * + ********************************************/ + std::cout << "Initializing device data..." << std::endl; + + CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); + + /************************************************ + * Retrieve the memory alignment for each tensor + ************************************************/ + uint32_t alignmentRequirementA; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); + + uint32_t alignmentRequirementD; + CHECK_HIPTENSOR_ERROR( + hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + std::cout << "a_ms_ks: " << a_ms_ks << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns << std::endl; + + hiptensorContractionDescriptor_t desc; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, + &desc, + &a_ms_ks, + modeA.data(), + alignmentRequirementA, + &b_ns_ks, + modeB.data(), + alignmentRequirementB, + nullptr, + nullptr, + 0, + &d_ms_ns, + modeD.data(), + alignmentRequirementD, + typeCompute)); + /************************** + * Set the algorithm to use + ***************************/ + + hiptensorContractionFind_t find; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( + handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void* workspace = nullptr; + + if(worksize > 0) + { + CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); + } + + /************************** + * 
Create Contraction Plan + **************************/ + std::cout << "Initializing contraction plan..." << std::endl; + + hiptensorContractionPlan_t plan; + CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); + + std::cout << "Launching contraction kernel..." << std::endl; + + CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, + &plan, + (void*)&alpha, + A_d, + B_d, + nullptr, + nullptr, + D_d, + workspace, + worksize, + 0 /* stream */)); + + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + +#if !NDEBUG + bool printElements = false; + bool storeElements = false; + + if(printElements || storeElements) + { + CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); + } + + if(printElements) + { + if(elementsA < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor A elements:\n"; + hiptensorPrintArrayElements(std::cout, A, elementsA); + std::cout << std::endl; + } + + if(elementsB < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor B elements:\n"; + hiptensorPrintArrayElements(std::cout, B, elementsB); + std::cout << std::endl; + } + + if(elementsD < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, D, elementsD); + std::cout << std::endl; + } + } + + if(storeElements) + { + std::ofstream tensorA, tensorB, tensorD; + tensorA.open("tensor_A.txt"); + hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); + tensorA.close(); + + tensorB.open("tensor_B.txt"); + hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); + tensorB.close(); + + tensorD.open("tensor_D_scale_contraction_results.txt"); + hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); + tensorD.close(); + } + +#endif + + CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); + + HIPTENSOR_FREE_HOST(A); + HIPTENSOR_FREE_HOST(B); + HIPTENSOR_FREE_HOST(D); + + HIPTENSOR_FREE_DEVICE(A_d); + HIPTENSOR_FREE_DEVICE(B_d); + HIPTENSOR_FREE_DEVICE(D_d); + HIPTENSOR_FREE_DEVICE(workspace); + + std::cout << "Finished!" 
<< std::endl; + + return 0; +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..a08065a0 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] Algorithm Types: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..b28e9a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,6 +1,8 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] Algorithm Types: diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..9446157f 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,7 +56,8 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) || (isF64Supported() && datatype == HIP_R_64F); } @@ -115,11 +116,15 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) + || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) || (CDataType == NONE_TYPE)); - EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); mRunFlag &= checkDevice(DDataType); @@ -228,7 +233,35 @@ namespace hiptensor auto resource = getResource(); resource->resizeStorage(lengths, elementBytes); - if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) + if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F) + { + // Initialize matrix data on device + fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA); + fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16F) + { + fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD); + } + fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + elementsCD, + 
std::numeric_limits<_Float16>::signaling_NaN()); + } + else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF) + { + // Initialize matrix data on device + fillLaunchKernel((hip_bfloat16*)resource->deviceA().get(), elementsA); + fillLaunchKernel((hip_bfloat16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16BF) + { + fillLaunchKernel((hip_bfloat16*)resource->deviceC().get(), + elementsCD); + } + fillValLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + elementsCD, + std::numeric_limits::signaling_NaN()); + } + else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) { // Initialize matrix data on device fillLaunchKernel((float*)resource->deviceA().get(), elementsA); @@ -328,7 +361,7 @@ namespace hiptensor { auto resource = getResource(); - int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)); + int size = hipDataTypeSize(DDataType); size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(), a_ms_ks.mLengths.end(), @@ -346,7 +379,50 @@ namespace hiptensor auto D = resource->allocHost(elementsCD * size); resource->copyData(D, resource->deviceD(), elementsCD * size); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_16BF) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements( + stream, (hip_bfloat16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_32F) { stream << "Tensor A elements:\n"; hiptensorPrintArrayElements( @@ -456,11 +532,24 @@ namespace hiptensor size_t{1}, std::multiplies()); - int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? 
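// [Aside, not patch content. Together with the signaling-NaN fills above,
// which poison D so any element the kernel fails to write trips validation,
// the test hunks here retire the hard-coded
//   (DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)
// size computation (the removed line this note interrupts) in favour of
// hipDataTypeSize(DDataType), which scales to the new 16-bit types. Below is
// a self-contained sketch of what such a helper must now cover;
// dataTypeSizeSketch is illustrative — per patch 02's diffstat, the real
// helper lives in library/src/data_types.cpp — and it assumes the HIP
// headers are available.]

#include <cstddef>
#include <hip/library_types.h> // hipDataType, HIP_R_16F, ...

// Illustrative stand-in for hipTensor's hipDataTypeSize().
std::size_t dataTypeSizeSketch(hipDataType t)
{
    switch(t)
    {
    case HIP_R_16F:  return 2; // _Float16
    case HIP_R_16BF: return 2; // hip_bfloat16
    case HIP_R_32F:  return 4; // float
    case HIP_R_64F:  return 8; // double
    default:         return 0; // e.g. hiptensor's NONE_TYPE: caller must reject
    }
}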
sizeof(float) : sizeof(double)); + int sizeD = elementsCD * hipDataTypeSize(DDataType); auto reference = resource->allocDevice(sizeD); resource->copyData(reference, resource->hostD(), sizeD); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( + (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + } + else if(DDataType == HIP_R_16BF) + { + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel( + (hip_bfloat16*)resource->deviceD().get(), + (hip_bfloat16*)reference.get(), + elementsCD); + } + else if(DDataType == HIP_R_32F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); diff --git a/test/device/common.hpp b/test/device/common.hpp index f961abc1..172e6953 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,7 +72,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast(value); } } diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 46f4c43e..cd3eb46f 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -110,6 +110,7 @@ namespace llvm static void enumeration(IO& io, hipDataType& value) { io.enumCase(value, "HIP_R_16F", HIP_R_16F); + io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE); diff --git a/test/utils.hpp b/test/utils.hpp index 1f7ece44..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -41,9 +41,9 @@ #include #include #include +#include #include "device/common.hpp" -#include "types.hpp" #define HIPTENSOR_FREE_DEVICE(ptr) \ if(ptr != nullptr) \ From 185a2ab115d4e6e8999917e349ca1a4d803e5228 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Sat, 25 Nov 2023 02:38:48 +0000 Subject: [PATCH 02/12] Add support to f32_f16, f32_bf16, f64_f32 to contraction - Support ABCD data type f32 and compute type f16, bf16 - Support ABCD data type f64 and compute type f32 - Fixed bug: alpha, beta were passed in as wrong data type in unit test of contraction - Create sample template of contraction --- .../contraction/contraction_cpu_reference.cpp | 48 +- .../contraction/contraction_cpu_reference.hpp | 39 +- .../contraction_cpu_reference_instances.cpp | 104 ++- .../contraction/contraction_meta_traits.hpp | 18 +- .../src/contraction/contraction_selection.cpp | 737 ++++++++++++++---- .../src/contraction/contraction_selection.hpp | 9 +- .../contraction/contraction_solution_impl.hpp | 9 +- .../contraction_solution_instances.cpp | 88 ++- .../contraction_solution_params.hpp | 9 +- .../contraction_solution_params_impl.hpp | 6 + .../contraction_solution_registry.cpp | 81 +- .../contraction_solution_registry.hpp | 60 +- library/src/contraction/device/CMakeLists.txt | 28 +- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 27 +- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 27 +- 
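[One small but consequential fix in the test/device/common.hpp hunk above, worth spelling out: fillKernel used to divide by DataType(RAND_MAX). For DataType = _Float16 that conversion overflows to infinity (binary16's largest finite value is 65504), so the quotient is zero for every index and the fill degenerates to a constant; for hip_bfloat16 the 8-bit significand quantizes the quotient coarsely. Dividing by double(RAND_MAX) and converting the final result once avoids both. A small host-side demonstration of the failure mode, assuming a compiler with _Float16 support such as hipcc/clang; it is a sketch, not part of the patch.]

#include <cstdint>
#include <cstdlib>
#include <iostream>

int main()
{
    // binary16 cannot represent RAND_MAX (typically 2147483647); the
    // conversion rounds to +infinity, so "index / DataType(RAND_MAX)"
    // yielded zero for every index when DataType was _Float16.
    _Float16 halfRandMax = static_cast<_Float16>(RAND_MAX);
    std::cout << "RAND_MAX as _Float16: " << static_cast<float>(halfRandMax)
              << "\n"; // inf

    // The fixed kernel normalizes in double first and converts once:
    std::uint32_t index = 12345;
    double        norm  = (index / double(RAND_MAX) - 0.5) * 100;
    _Float16      fill  = static_cast<_Float16>(norm);
    std::cout << "fill value: " << static_cast<float>(fill) << "\n";
    return 0;
}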
..._f16_f16_f16_compute_f32_mknn_instance.cpp | 27 +- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 27 +- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 85 ++ ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 85 ++ ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 85 ++ ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 27 +- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 27 +- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 27 +- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 85 ++ ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 85 ++ ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 85 ++ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 62 ++ ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 62 ++ .../src/contraction/hiptensor_contraction.cpp | 59 +- library/src/data_types.cpp | 43 + library/src/include/data_types.hpp | 2 + samples/01_contraction/CMakeLists.txt | 31 + .../simple_bilinear_contraction.hpp | 351 +++++++++ .../simple_bilinear_contraction_bf16.cpp | 313 +------- .../simple_bilinear_contraction_f16.cpp | 313 +------- .../simple_bilinear_contraction_f32.cpp | 313 +------- .../simple_bilinear_contraction_f32_bf16.cpp | 57 ++ .../simple_bilinear_contraction_f32_f16.cpp | 57 ++ .../simple_bilinear_contraction_f64.cpp | 57 ++ .../simple_bilinear_contraction_f64_f32.cpp | 57 ++ .../simple_scale_contraction.hpp | 341 ++++++++ .../simple_scale_contraction_bf16.cpp | 311 +------- .../simple_scale_contraction_f16.cpp | 317 +------- .../simple_scale_contraction_f32.cpp | 310 +------- .../simple_scale_contraction_f32_bf16.cpp | 58 ++ .../simple_scale_contraction_f32_f16.cpp | 58 ++ .../simple_scale_contraction_f64.cpp | 57 ++ .../simple_scale_contraction_f64_f32.cpp | 57 ++ .../configs/bilinear_test_params.yaml | 7 +- .../configs/scale_test_params.yaml | 3 + test/01_contraction/contraction_test.cpp | 18 +- 76 files changed, 4657 insertions(+), 2149 deletions(-) create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp create mode 100644 
library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp create mode 100644 library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction.hpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64.cpp create mode 100644 samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp create mode 100644 samples/01_contraction/simple_scale_contraction.hpp create mode 100644 samples/01_contraction/simple_scale_contraction_f32_bf16.cpp create mode 100644 
samples/01_contraction/simple_scale_contraction_f32_f16.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_f64.cpp
 create mode 100644 samples/01_contraction/simple_scale_contraction_f64_f32.cpp

diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp
index 13dcdffd..ac1d9711 100644
--- a/library/src/contraction/contraction_cpu_reference.cpp
+++ b/library/src/contraction/contraction_cpu_reference.cpp
@@ -28,31 +28,33 @@
 #include "contraction_cpu_reference_impl.hpp"
 #include "contraction_cpu_reference_instances.hpp"
 
-hiptensorStatus_t hiptensorContractionReference(void const*                     alpha,
-                                                void const*                     A,
-                                                void const*                     B,
-                                                void const*                     beta,
-                                                void const*                     C,
-                                                void*                           D,
-                                                std::vector<std::size_t> const& a_ms_ks_lengths,
-                                                std::vector<std::size_t> const& a_ms_ks_strides,
-                                                std::vector<std::size_t> const& b_ns_ks_lengths,
-                                                std::vector<std::size_t> const& b_ns_ks_strides,
-                                                std::vector<std::size_t> const& c_ms_ns_lengths,
-                                                std::vector<std::size_t> const& c_ms_ns_strides,
-                                                std::vector<std::size_t> const& d_ms_ns_lengths,
-                                                std::vector<std::size_t> const& d_ms_ns_strides,
-                                                hipDataType                     typeA,
-                                                hipDataType                     typeB,
-                                                hipDataType                     typeC,
-                                                hipDataType                     typeD,
-                                                void*                           workspace)
+hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan,
+                                                void const*                       alpha,
+                                                void const*                       A,
+                                                void const*                       B,
+                                                void const*                       beta,
+                                                void const*                       C,
+                                                void*                             D,
+                                                std::vector<std::size_t> const&   a_ms_ks_lengths,
+                                                std::vector<std::size_t> const&   a_ms_ks_strides,
+                                                std::vector<std::size_t> const&   b_ns_ks_lengths,
+                                                std::vector<std::size_t> const&   b_ns_ks_strides,
+                                                std::vector<std::size_t> const&   c_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   c_ms_ns_strides,
+                                                std::vector<std::size_t> const&   d_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   d_ms_ns_strides,
+                                                hipDataType                       typeA,
+                                                hipDataType                       typeB,
+                                                hipDataType                       typeC,
+                                                hipDataType                       typeD,
+                                                void*                             workspace)
 {
-    auto& instances = hiptensor::ContractionCpuReferenceInstances::instance();
+    auto& instances   = hiptensor::ContractionCpuReferenceInstances::instance();
+    auto  computeType = plan->mContractionDesc.mComputeType;
 
     auto candidates
-        = (C == nullptr)
-              ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD)
-              : instances->allSolutions().query(typeA, typeB, typeC, typeD);
+        = (C == nullptr) ? instances->allSolutions().query(
              typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType)
+                         : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType);
 
     auto toCKVec
         = [](auto& inputVec) { return std::vector<ck::index_t>(inputVec.begin(), inputVec.end()); };
 
diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp
index aadb062e..471026dc 100644
--- a/library/src/contraction/contraction_cpu_reference.hpp
+++ b/library/src/contraction/contraction_cpu_reference.hpp
@@ -32,24 +32,25 @@
 
 #include
 
-hiptensorStatus_t hiptensorContractionReference(void const*                     alpha,
-                                                void const*                     A,
-                                                void const*                     B,
-                                                void const*                     beta,
-                                                void const*                     C,
-                                                void*                           D,
-                                                std::vector<std::size_t> const& a_ms_ks_lengths,
-                                                std::vector<std::size_t> const& a_ms_ks_strides,
-                                                std::vector<std::size_t> const& b_ks_ns_lengths,
-                                                std::vector<std::size_t> const& b_ks_ns_strides,
-                                                std::vector<std::size_t> const& c_ms_ns_lengths,
-                                                std::vector<std::size_t> const& c_ms_ns_strides,
-                                                std::vector<std::size_t> const& d_ms_ns_lengths,
-                                                std::vector<std::size_t> const& d_ms_ns_strides,
-                                                hipDataType                     typeA,
-                                                hipDataType                     typeB,
-                                                hipDataType                     typeC,
-                                                hipDataType                     typeD,
-                                                void*                           workspace);
+hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan,
+                                                void const*                       alpha,
+                                                void const*                       A,
+                                                void const*                       B,
+                                                void const*                       beta,
+                                                void const*                       C,
+                                                void*                             D,
+                                                std::vector<std::size_t> const&   a_ms_ks_lengths,
+                                                std::vector<std::size_t> const&   a_ms_ks_strides,
+                                                std::vector<std::size_t> const&   b_ks_ns_lengths,
+                                                std::vector<std::size_t> const&   b_ks_ns_strides,
+                                                std::vector<std::size_t> const&   c_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   c_ms_ns_strides,
+                                                std::vector<std::size_t> const&   d_ms_ns_lengths,
+                                                std::vector<std::size_t> const&   d_ms_ns_strides,
+                                                hipDataType                       typeA,
+                                                hipDataType                       typeB,
+                                                hipDataType                       typeC,
+                                                hipDataType                       typeD,
+                                                void*                             workspace);
 
 #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP
diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 146d2721..173a49e9 100644
--- a/library/src/contraction/contraction_cpu_reference_instances.cpp
+++ b/library/src/contraction/contraction_cpu_reference_instances.cpp
@@ -37,10 +37,10 @@ namespace hiptensor
             enumerateReferenceSolutions<2,
                                         2,
                                         2,
-                                        _Float16,
-                                        _Float16,
-                                        ck::Tuple<_Float16>,
-                                        _Float16,
+                                        ck::half_t,
+                                        ck::half_t,
+                                        ck::Tuple<ck::half_t>,
+                                        ck::half_t,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::Bilinear,
@@ -71,7 +71,34 @@ namespace hiptensor
                                         float,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Bilinear>());
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        ck::half_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<float>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        ck::bhalf_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<float>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
 
         // Bilinear f64
         registerSolutions(
@@ -84,17 +111,31 @@ namespace hiptensor
                                         double,
                                         double,
                                         ck::Tuple<double>,
                                         double,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Bilinear>());
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        double,
+                                        double,
+                                        ck::Tuple<double>,
+                                        double,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        double>());
 
         // Scale f16
         registerSolutions(
             enumerateReferenceSolutions<2,
                                         2,
                                         2,
-                                        _Float16,
-                                        _Float16,
+                                        ck::half_t,
+                                        ck::half_t,
                                         ck::Tuple<>,
-                                        _Float16,
+                                        ck::half_t,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::Scale,
@@ -125,7 +166,34 @@ namespace hiptensor
                                         float,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Scale>());
+                                        ck::tensor_operation::element_wise::Scale,
+                                        ck::half_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        ck::bhalf_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        float,
+                                        float,
+                                        ck::Tuple<>,
+                                        float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());
 
         // Scale f64
         registerSolutions(
@@ -138,6 +206,20 @@ namespace hiptensor
                                         double,
                                         double,
                                         ck::Tuple<>,
                                         double,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Scale>());
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        double,
+                                        double,
+                                        ck::Tuple<>,
+                                        double,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        double>());
     }
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp
index ab158f96..6a7cb35f 100644
--- a/library/src/contraction/contraction_meta_traits.hpp
+++ b/library/src/contraction/contraction_meta_traits.hpp
@@ -75,9 +75,12 @@ namespace hiptensor
             = std::conditional_t<std::is_same_v<DsDataType, ck::bhalf_t>, hip_bfloat16, DsDataType>;
         using EDataT
             = std::conditional_t<std::is_same_v<EDataType, ck::bhalf_t>, hip_bfloat16, EDataType>;
-        using AOp   = AElementwiseOperation;
-        using BOp   = BElementwiseOperation;
-        using CDEOp = ck::tensor_operation::element_wise::Bilinear;
+        using ComputeDataT = std::conditional_t<std::is_same_v<ComputeDataType, ck::bhalf_t>,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp          = AElementwiseOperation;
+        using BOp          = BElementwiseOperation;
+        using CDEOp        = ck::tensor_operation::element_wise::Bilinear;
     };
 
     // Partial specialize for Scale contraction
@@ -113,9 +116,12 @@ namespace hiptensor
         using DDataT = NoneType;
         using EDataT
             = std::conditional_t<std::is_same_v<EDataType, ck::bhalf_t>, hip_bfloat16, EDataType>;
-        using AOp   = AElementwiseOperation;
-        using BOp   = BElementwiseOperation;
-        using CDEOp = ck::tensor_operation::element_wise::Scale;
+        using ComputeDataT = std::conditional_t<std::is_same_v<ComputeDataType, ck::bhalf_t>,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp          = AElementwiseOperation;
+        using BOp          = BElementwiseOperation;
+        using CDEOp        = ck::tensor_operation::element_wise::Scale;
     };
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp
index 1b2cf92e..888ef4c1 100644
--- a/library/src/contraction/contraction_selection.cpp
+++ b/library/src/contraction/contraction_selection.cpp
@@ -54,6 +54,7 @@ namespace hiptensor
                                    hipDataType                     typeE,
                                    std::vector<std::size_t> const& e_ms_ns_lengths,
                                    std::vector<std::size_t> const& e_ms_ns_strides,
+                                   hiptensorComputeType_t          computeType,
                                    const uint64_t                  workspaceSize)
     {
         // Make sure that we calculate full element space in case strides are not packed.
@@ -70,9 +71,11 @@ namespace hiptensor
         auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides)
                      * hipDataTypeSize(typeE);
 
-        void *A_d, *B_d, *D_d, *E_d, *wspace;
-        float alpha = 1.02f;
-        float beta  = 1.03f;
+        void * A_d, *B_d, *D_d, *E_d, *wspace;
+        double alpha = 0.0;
+        double beta  = 0.0;
+        writeVal(&alpha, computeType, 1.02);
+        writeVal(&beta, computeType, 1.03);
 
         CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA));
         CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB));
@@ -150,9 +153,13 @@ namespace hiptensor
         }
     }
 
-    // test
     template <>
-    struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::SCALE>
+    struct ActorCriticSelection<_Float16,
+                                _Float16,
+                                _Float16,
+                                _Float16,
+                                ContractionOpId_t::SCALE,
+                                float>
     {
         static hiptensorStatus_t
             selectWinner(ContractionSolution** winner,
@@ -196,7 +203,12 @@ namespace hiptensor
     };
 
     template <>
-    struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, ContractionOpId_t::BILINEAR>
+    struct ActorCriticSelection<_Float16,
+                                _Float16,
+                                _Float16,
+                                _Float16,
+                                ContractionOpId_t::BILINEAR,
+                                float>
     {
         static hiptensorStatus_t
             selectWinner(ContractionSolution** winner,
@@ -224,7 +236,7 @@ namespace hiptensor
 
             size_t unique_id = 0;
 
-            // select unique_id
+            // TODO select unique_id
             unique_id = 7255639152084218514;
 
             if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -244,7 +256,8 @@ namespace hiptensor
                                 hip_bfloat16,
                                 hip_bfloat16,
                                 hip_bfloat16,
-                                ContractionOpId_t::SCALE>
+                                ContractionOpId_t::SCALE,
+                                float>
     {
         static hiptensorStatus_t
             selectWinner(ContractionSolution** winner,
@@ -292,7 +305,8 @@ namespace hiptensor
                                 hip_bfloat16,
                                 hip_bfloat16,
                                 hip_bfloat16,
-                                ContractionOpId_t::BILINEAR>
+                                ContractionOpId_t::BILINEAR,
+                                float>
     {
         static hiptensorStatus_t
            selectWinner(ContractionSolution** winner,
@@ -320,7 +334,7 @@ namespace hiptensor
 
            size_t unique_id = 0;
 
-           // select unique_id
+           // TODO select unique_id
           unique_id = 8689089455041651212;
 
           if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
@@ -334,10 +348,183 @@ namespace hiptensor
           }
        }
    };
-   // end test
 
    template <>
-   struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE>
+   struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE, _Float16>
+   {
+       static hiptensorStatus_t
+           selectWinner(ContractionSolution** winner,
+                        std::unordered_map<size_t, ContractionSolution*> const& candidates,
+                        hipDataType                     typeA,
+                        std::vector<std::size_t> const& a_ms_ks_lengths,
+                        std::vector<std::size_t> const& a_ms_ks_strides,
+                        hipDataType                     typeB,
+                        std::vector<std::size_t> const& b_ns_ks_lengths,
+                        std::vector<std::size_t> const& b_ns_ks_strides,
+                        hipDataType                     typeD,
+                        std::vector<std::size_t> const& d_ms_ns_lengths,
+                        std::vector<std::size_t> const& d_ms_ns_strides,
+                        hipDataType                     typeE,
+                        std::vector<std::size_t> const& e_ms_ns_lengths,
+                        std::vector<std::size_t> const& e_ms_ns_strides,
+                        const uint64_t                  workspaceSize)
+       {
+           int d1 = a_ms_ks_lengths[0];
+           int d2 = a_ms_ks_lengths[1];
+           int d3 = b_ns_ks_lengths[0];
+           int d4 = b_ns_ks_lengths[1];
+           int d5 = a_ms_ks_lengths[2];
+           int d6 = a_ms_ks_lengths[3];
+
+           size_t unique_id = 0;
+
+           // TODO select unique_id
+
+           if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+           {
+               *winner = candidate->second;
+               return HIPTENSOR_STATUS_SUCCESS;
+           }
+           else
+           {
+               return HIPTENSOR_STATUS_EXECUTION_FAILED;
+           }
+       }
+   };
+
+   template <>
+   struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::BILINEAR, _Float16>
+   {
+       static hiptensorStatus_t
+           selectWinner(ContractionSolution** winner,
+ std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -702,7 +889,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1060,7 +1247,92 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& 
candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection + { + static hiptensorStatus_t + selectWinner(ContractionSolution** winner, + std::unordered_map const& candidates, + hipDataType typeA, + std::vector const& a_ms_ks_lengths, + std::vector const& a_ms_ks_strides, + hipDataType typeB, + std::vector const& b_ns_ks_lengths, + std::vector const& b_ns_ks_strides, + hipDataType typeD, + std::vector const& d_ms_ns_lengths, + std::vector const& d_ms_ns_strides, + hipDataType typeE, + std::vector const& e_ms_ns_lengths, + std::vector const& e_ms_ns_strides, + const uint64_t workspaceSize) + { + int d1 = a_ms_ks_lengths[0]; + int d2 = a_ms_ks_lengths[1]; + int d3 = b_ns_ks_lengths[0]; + int d4 = b_ns_ks_lengths[1]; + int d5 = a_ms_ks_lengths[2]; + int d6 = a_ms_ks_lengths[3]; + + size_t unique_id = 0; + // TODO select unique_id + + if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) + { + *winner = candidate->second; + return HIPTENSOR_STATUS_SUCCESS; + } + else + { + return HIPTENSOR_STATUS_EXECUTION_FAILED; + } + } + }; + + template <> + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1335,7 +1607,7 @@ namespace hiptensor }; template <> - struct ActorCriticSelection + struct ActorCriticSelection { static hiptensorStatus_t selectWinner(ContractionSolution** winner, @@ -1602,181 +1874,344 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize) { - if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F) + if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::SCALE>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F - && typeE == HIP_R_16F) + else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F + && computeType == HIP_R_32F) { return 
ActorCriticSelection<_Float16, _Float16, _Float16, _Float16, - ContractionOpId_t::BILINEAR>::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::SCALE, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF - && typeE == HIP_R_16BF) + && typeE == HIP_R_16BF && computeType == HIP_R_32F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + float>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_16BF) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, 
+ d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F - && typeE == HIP_R_32F) + else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_32F) { - return ActorCriticSelection:: - selectWinner(winner, - candidates, - typeA, - a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_32F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); + } + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F + && computeType == HIP_R_64F) + { + return ActorCriticSelection::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } - else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F - && typeE == HIP_R_64F) + else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F + && computeType == HIP_R_64F) { return ActorCriticSelection::selectWinner(winner, - candidates, - typeA, - 
a_ms_ks_lengths, - a_ms_ks_strides, - typeB, - b_ns_ks_lengths, - b_ns_ks_strides, - typeD, - d_ms_ns_lengths, - d_ms_ns_strides, - typeE, - e_ms_ns_lengths, - e_ms_ns_strides, - workspaceSize); + ContractionOpId_t::BILINEAR, + double>::selectWinner(winner, + candidates, + typeA, + a_ms_ks_lengths, + a_ms_ks_strides, + typeB, + b_ns_ks_lengths, + b_ns_ks_strides, + typeD, + d_ms_ns_lengths, + d_ms_ns_strides, + typeE, + e_ms_ns_lengths, + e_ms_ns_strides, + workspaceSize); } return HIPTENSOR_STATUS_EXECUTION_FAILED; } diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp index 9ceb6a14..deb980d9 100644 --- a/library/src/contraction/contraction_selection.hpp +++ b/library/src/contraction/contraction_selection.hpp @@ -49,9 +49,15 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); - template + template struct ActorCriticSelection { static hiptensorStatus_t @@ -87,6 +93,7 @@ namespace hiptensor hipDataType typeE, std::vector const& e_ms_ns_lengths, std::vector const& e_ms_ns_strides, + hiptensorComputeType_t computeType, const uint64_t workspaceSize); } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp index 5e191441..3b672fbb 100644 --- a/library/src/contraction/contraction_solution_impl.hpp +++ b/library/src/contraction/contraction_solution_impl.hpp @@ -95,11 +95,13 @@ namespace hiptensor if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } if(beta != nullptr) { - betaF = hiptensor::readVal(beta, HipDataType_v); + betaF = hiptensor::readVal( + beta, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... @@ -205,7 +207,8 @@ namespace hiptensor if(alpha != nullptr) { - alphaF = hiptensor::readVal(alpha, HipDataType_v); + alphaF = hiptensor::readVal( + alpha, convertToComputeType(HipDataType_v)); } // CK has its own format for indices... 
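The readVal change above is the core of the alpha/beta fix: the type-erased scalars are now decoded with convertToComputeType(HipDataType_v<ComputeDataT>), i.e. according to the contraction's compute type, instead of the output tensor's data type. A minimal sketch of the idea follows; readScalarAsFloat is a hypothetical stand-in for the library's actual readVal/convertToComputeType helpers (extended in library/src/data_types.cpp by this patch), which differ in detail:

    #include <hip/hip_bfloat16.h>  // hip_bfloat16
    #include <hip/library_types.h> // hipDataType, HIP_R_16F, ...

    // Hypothetical helper: decode a type-erased scalar according to the
    // compute type selected for the contraction, widening to float for the
    // CK element-wise operation arguments.
    inline float readScalarAsFloat(void const* scalar, hipDataType computeType)
    {
        switch(computeType)
        {
        case HIP_R_16F:
            return static_cast<float>(*static_cast<_Float16 const*>(scalar));
        case HIP_R_16BF:
            return static_cast<float>(*static_cast<hip_bfloat16 const*>(scalar));
        case HIP_R_32F:
            return *static_cast<float const*>(scalar);
        case HIP_R_64F:
            return static_cast<float>(*static_cast<double const*>(scalar));
        default:
            return 0.0f; // unsupported compute type
        }
    }

If the caller packs alpha for one type and the library reads it as another (the pre-fix behavior, which assumed the output data type), the bit pattern is reinterpreted rather than converted; that is exactly the unit-test bug called out in this patch's commit message.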
diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp index 6d481577..aec12e32 100644 --- a/library/src/contraction/contraction_solution_instances.cpp +++ b/library/src/contraction/contraction_solution_instances.cpp @@ -72,7 +72,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::half_t>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + ck::bhalf_t>()); // Bilinear f64 registerSolutions( @@ -85,7 +112,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Bilinear>()); + ck::tensor_operation::element_wise::Bilinear, + float>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear, + double>()); // Scale bf16 registerSolutions( @@ -126,8 +166,34 @@ namespace hiptensor float, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::half_t>()); + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + float, + float, + ck::Tuple<>, + float, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + ck::bhalf_t>()); // Scale f64 registerSolutions( enumerateContractionSolutions<2, @@ -139,6 +205,20 @@ namespace hiptensor double, ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::Scale>()); + ck::tensor_operation::element_wise::Scale, + float>()); + + registerSolutions( + enumerateContractionSolutions<2, + 2, + 2, + double, + double, + ck::Tuple<>, + double, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale, + double>()); } } // namespace hiptensor diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp index ec9de45c..4c44de88 100644 --- a/library/src/contraction/contraction_solution_params.hpp +++ b/library/src/contraction/contraction_solution_params.hpp @@ -49,10 +49,11 @@ namespace hiptensor virtual int32_t dimsK() const = 0; // Map to hipDataType - virtual hipDataType typeA() const = 0; - 
virtual hipDataType typeB() const = 0; - virtual hipDataType typeC() const = 0; - virtual hipDataType typeD() const = 0; + virtual hipDataType typeA() const = 0; + virtual hipDataType typeB() const = 0; + virtual hipDataType typeC() const = 0; + virtual hipDataType typeD() const = 0; + virtual hiptensorComputeType_t typeCompute() const = 0; // Map to operators virtual hiptensorOperator_t opA() const = 0; diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp index bff33960..b84f9c2b 100644 --- a/library/src/contraction/contraction_solution_params_impl.hpp +++ b/library/src/contraction/contraction_solution_params_impl.hpp @@ -42,6 +42,7 @@ namespace std return hiptensor::Hash{}(s.dimsM(), s.dimsN(), s.dimsK(), + s.typeCompute(), s.typeA(), s.typeB(), s.typeC(), @@ -102,6 +103,11 @@ namespace hiptensor return HipDataType_v; } + hiptensorComputeType_t typeCompute() const override + { + return convertToComputeType(HipDataType_v); + } + hiptensorOperator_t opA() const override { return ElementWiseOperatorType_v; diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp index 83674c81..9e2da1f9 100644 --- a/library/src/contraction/contraction_solution_registry.cpp +++ b/library/src/contraction/contraction_solution_registry.cpp @@ -53,19 +53,20 @@ namespace hiptensor } ContractionSolutionRegistry::Query - ContractionSolutionRegistry::Query::query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const + ContractionSolutionRegistry::Query::query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const { - auto solutionHash - = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + auto solutionHash = hashSolution( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end()) { @@ -81,10 +82,14 @@ namespace hiptensor return query(hashDimsMNK(dimsM, dimsN, dimsK)); } - ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const + ContractionSolutionRegistry::Query + ContractionSolutionRegistry::Query::query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const { - return query(hashTypesABCD(typeA, typeB, typeC, typeD)); + return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute)); } ContractionSolutionRegistry::Query @@ -159,18 +164,20 @@ namespace hiptensor /* static */ ContractionSolutionRegistry::Query::HashId - ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) + ContractionSolutionRegistry::Query::hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + 
hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) { - return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE); + return Hash{}( + dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute); } /* static */ @@ -181,10 +188,14 @@ namespace hiptensor } /* static */ - ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD( - hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) + ContractionSolutionRegistry::Query::HashId + ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) { - return Hash{}(typeA, typeB, typeC, typeD); + return Hash{}(typeA, typeB, typeC, typeD, typeCompute); } /* static */ @@ -220,12 +231,16 @@ namespace hiptensor params->typeD(), params->opA(), params->opB(), - params->opCDE()); + params->opCDE(), + params->typeCompute()); auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK()); - auto typesABCDHash - = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD()); + auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(), + params->typeB(), + params->typeC(), + params->typeD(), + params->typeCompute()); auto elementOpsHash = hashElementOps(params->opA(), params->opB()); @@ -236,7 +251,7 @@ namespace hiptensor mAllSolutions[solutionUid] = solution; mSolutionHash[solutionHash].push_back(solution); mSolutionHash[dimsMNKHash].push_back(solution); - mSolutionHash[typesABCDHash].push_back(solution); + mSolutionHash[typesComputeABCDHash].push_back(solution); mSolutionHash[elementOpsHash].push_back(solution); mSolutionHash[contactionOpsHash].push_back(solution); } diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp index d1b80ec5..44aaa97d 100644 --- a/library/src/contraction/contraction_solution_registry.hpp +++ b/library/src/contraction/contraction_solution_registry.hpp @@ -59,25 +59,27 @@ namespace hiptensor /// E.g. in this context, query further parameters. 
// By full solution type - Query query(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE) const; + Query query(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute) const; // By dimensions Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const; // By data types - Query query(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD) const; + Query query(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute) const; // By element-wise operations Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const; @@ -104,22 +106,24 @@ namespace hiptensor Query query(HashId queryHash) const; // Hashing helpers - static HashId hashSolution(int32_t dimsM, - int32_t dimsN, - int32_t dimsK, - hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD, - hiptensorOperator_t opA, - hiptensorOperator_t opB, - ContractionOpId_t opCDE); + static HashId hashSolution(int32_t dimsM, + int32_t dimsN, + int32_t dimsK, + hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorOperator_t opA, + hiptensorOperator_t opB, + ContractionOpId_t opCDE, + hiptensorComputeType_t typeCompute); static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK); - static HashId hashTypesABCD(hipDataType typeA, - hipDataType typeB, - hipDataType typeC, - hipDataType typeD); + static HashId hashTypesComputeABCD(hipDataType typeA, + hipDataType typeB, + hipDataType typeC, + hipDataType typeD, + hiptensorComputeType_t typeCompute); static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB); static HashId hashContractionOps(ContractionOpId_t opCDE); diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt index b9b382c0..eacac5b1 100644 --- a/library/src/contraction/device/CMakeLists.txt +++ b/library/src/contraction/device/CMakeLists.txt @@ -33,10 +33,22 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -49,15 +61,27 @@ ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp ) - add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) - target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) +add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) +target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index 7d777a83..3b3f6d47 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index a9a97148..fd43f0ad 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index d83d8d16..21fb8127 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index bc49c82b..cc975c03 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index a9d963ab..ff670630 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index c139942e..be8bfe84 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 3c6ced30..4be69898 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index 33c66296..2f6d630b 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp new file mode 100644 index 00000000..d82ea442 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp new file mode 100644 index 00000000..772df2e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp new file mode 100644 index 00000000..8b1d0681 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ad5ce461 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..ae3ee856 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..b72005ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..b94030e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp index 05400151..1da8301f 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index bba95b14..82c17500 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
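The scale instances in this stretch of the patch mirror the bilinear ones but drop the D operand entirely, which is why their layout suffix has three letters (for A/B/E) rather than four. In terms of the sketch above, scale is simply the beta = 0 special case; a purely illustrative usage (all extents and values assumed):

    #include <cstddef>
    #include <vector>

    int main()
    {
        const std::size_t m0 = 4, m1 = 4, n0 = 4, n1 = 4, k0 = 4, k1 = 4;
        std::vector<float> A(m0 * m1 * k0 * k1, 1.0f);
        std::vector<float> B(n0 * n1 * k0 * k1, 1.0f);
        std::vector<float> D(m0 * m1 * n0 * n1, 0.0f); // unused when beta == 0
        std::vector<float> E(m0 * m1 * n0 * n1, 0.0f);

        // Scale contraction E = alpha * contract(A, B): the bilinear sketch
        // above with the D term switched off via beta = 0.
        bilinear_contraction_ref(A, B, D, E, m0, m1, n0, n1, k0, k1, 2.0f, 0.0f);

        // Every E element is now alpha * k0 * k1 = 2 * 16 = 32.
        return 0;
    }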
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index fb5ecec0..1febb560 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index 1dd6613c..02b9d719 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index e98aee20..5917e466 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
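The kkn/knn/mkn/mnn variants (and kknn/knnn/mknn/mnnn for bilinear) differ only in which index each tensor holds contiguously: per the comments in these files, each suffix letter names the fastest-changing dimension of A, B, (D,) and E in turn. A short stride sketch — illustrative only; the helper name and the concrete index orderings are assumptions, with extents borrowed from the samples later in this patch:

#include <array>
#include <cstddef>

// Row-major strides for a rank-4 tensor: the last extent is unit-stride.
std::array<std::size_t, 4> rowMajorStrides(const std::array<std::size_t, 4>& e)
{
    return {e[1] * e[2] * e[3], e[2] * e[3], e[3], 1};
}

int main()
{
    // A "k"-layout A[m0][m1][k0][k1]: the contracted index k1 gets stride 1.
    auto aK = rowMajorStrides({4, 3, 6, 5});
    // An "m"-layout instead keeps the free index m1 innermost, e.g. A[k0][k1][m0][m1].
    auto aM = rowMajorStrides({6, 5, 4, 3});
    return (aK[3] == 1 && aM[3] == 1) ? 0 : 1;
}

Registering one instance per layout lets the library match a kernel to whichever dimension is actually contiguous in the user's strides.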
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index db8de1c0..216f470e 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index 397ef327..3401b605 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index 1f9221dc..fe2fa97d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
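The new files that follow pair f32 tensor data with a lower-precision compute type (compute_bf16, then compute_f16). Conceptually, the compute type caps the precision carried through the inner product even though the tensors stay f32. A host-side illustration of the effect — illustrative only: the function names are invented here, bf16 conversion is mimicked by bit truncation rather than hip_bfloat16's actual rounding behavior, and the GPU path in composable_kernel is not this code:

#include <cstdint>
#include <cstring>

// Mimics narrowing an f32 value to bf16 by zeroing the low 16 bits
// (keeps the sign bit, 8 exponent bits, and the top 7 mantissa bits).
float toBf16Truncated(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0xFFFF0000u;
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

// f32 data, bf16-limited compute: operands lose precision before the
// multiply, so results can differ in the low bits from an all-f32 run.
float dotF32DataBf16Compute(const float* a, const float* b, int n)
{
    float acc = 0.f;
    for(int i = 0; i < n; ++i)
    {
        acc += toBf16Truncated(a[i]) * toBf16Truncated(b[i]);
    }
    return acc;
}

On typical inputs, dotF32DataBf16Compute agrees with a plain f32 dot product only to roughly bf16-level relative error (about 2^-8), which is the trade these instances make for speed.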
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp new file mode 100644 index 00000000..e4e4b7de --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp new file mode 100644 index 00000000..a641f6e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..04176d80 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..06481fc7 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..94922008 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..e70b854b --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index b96a204e..c7b7501b 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -242,17 +242,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. 
if(solnQ.solutionCount() == 0) @@ -461,15 +450,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +490,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +509,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +574,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast(alpha))); - } + auto alphaValue + = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); } if(beta == nullptr) @@ -602,15 +585,8 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast(beta))); - } + auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); } } else @@ -745,6 +721,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { auto time = (*cSolution)(StreamConfig{stream, true}); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -773,7 +753,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..38e9f186 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -132,6 +132,49 @@ namespace hiptensor } } + void 
writeVal(void const* addr, hiptensorComputeType_t id, double value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..19ccca6c 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -65,6 +65,8 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, double value); + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index 15972d60..de834d72 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -29,9 +29,17 @@ if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) # If building hipTensor samples as a standalone Cmake project else() @@ -44,6 +52,18 @@ else() add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + + 
add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) + target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) @@ -53,4 +73,15 @@ else() add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) + target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) + target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) + target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp new file mode 100644 index 00000000..aaef4a1b --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -0,0 +1,351 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +template +int bilinearContractionSample() +{ + floatTypeCompute alpha = (floatTypeCompute)1.0f; + floatTypeCompute beta = (floatTypeCompute)1.0f; + + /********************** + * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * + *C_{m,n,u,v} + **********************/ + + std::vector modeC{'m', 'n', 'u', 'v'}; + std::vector modeA{'m', 'n', 'h', 'k'}; + std::vector modeB{'u', 'v', 'h', 'k'}; + + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; + + std::vector c_ms_ns_lengths; + for(auto mode : modeC) + { + c_ms_ns_lengths.push_back(extent[mode]); + } + + std::vector a_ms_ks_lengths; + for(auto mode : modeA) + { + a_ms_ks_lengths.push_back(extent[mode]); + } + + std::vector b_ns_ks_lengths; + for(auto mode : modeB) + { + b_ns_ks_lengths.push_back(extent[mode]); + } + + hiptensorHandle_t* handle; + CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); + + CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); + + /******************************************** + * Initialize tensors with the input lengths * + ********************************************/ + hiptensorTensorDescriptor_t a_ms_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &a_ms_ks, + nmodeA, + a_ms_ks_lengths.data(), + NULL, /*stride*/ + typeA, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t b_ns_ks; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &b_ns_ks, + nmodeB, + b_ns_ks_lengths.data(), + NULL, /*stride*/ + typeB, + HIPTENSOR_OP_IDENTITY)); + + hiptensorTensorDescriptor_t c_ms_ns; + CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, + &c_ms_ns, + nmodeC, + c_ms_ns_lengths.data(), + NULL, /*stride*/ + typeC, + HIPTENSOR_OP_IDENTITY)); + + /********************** + * Allocating data + **********************/ + std::cout << "Initializing host data..." 
+              << std::endl;
+
+    size_t elementsA = std::accumulate(
+        a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsB = std::accumulate(
+        b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsC = std::accumulate(
+        c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies<size_t>());
+
+    size_t sizeA = sizeof(ADataType) * elementsA;
+    size_t sizeB = sizeof(BDataType) * elementsB;
+    size_t sizeC = sizeof(CDataType) * elementsC;
+
+    ADataType* A = nullptr;
+    BDataType* B = nullptr;
+    CDataType* C = nullptr;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC));
+
+    void *A_d, *B_d, *C_d;
+
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&A_d), sizeA));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&B_d), sizeB));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&C_d), sizeC));
+
+    /*******************
+     * Initialize data
+     *******************/
+    int initMethod = 0; // TODO read value from command line
+    for(int64_t i = 0; i < elementsA; i++)
+    {
+        if(initMethod == 0)
+        {
+            A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            A[i] = (ADataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsB; i++)
+    {
+        if(initMethod == 0)
+        {
+            B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            B[i] = (BDataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsC; i++)
+    {
+        if(initMethod == 0)
+        {
+            C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            C[i] = (CDataType)(float(i) / 100);
+        }
+    }
+
+    /********************************************
+     * Transfer the Host Tensor to Device Memory *
+     ********************************************/
+    std::cout << "Initializing device data..."
+              << std::endl;
+
+    CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast<const void*>(A), sizeA, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast<const void*>(B), sizeB, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast<const void*>(C), sizeC, hipMemcpyHostToDevice));
+
+    /************************************************
+     * Retrieve the memory alignment for each tensor
+     ************************************************/
+    uint32_t alignmentRequirementA;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA));
+
+    uint32_t alignmentRequirementB;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB));
+
+    uint32_t alignmentRequirementC;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC));
+
+    /*******************************
+     * Create Contraction Descriptor
+     *******************************/
+
+    std::cout << "a_ms_ks: " << a_ms_ks << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks << std::endl;
+    std::cout << "c_ms_ns: " << c_ms_ns << std::endl;
+
+    hiptensorContractionDescriptor_t desc;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle,
+                                                             &desc,
+                                                             &a_ms_ks,
+                                                             modeA.data(),
+                                                             alignmentRequirementA,
+                                                             &b_ns_ks,
+                                                             modeB.data(),
+                                                             alignmentRequirementB,
+                                                             &c_ms_ns,
+                                                             modeC.data(),
+                                                             alignmentRequirementC,
+                                                             &c_ms_ns,
+                                                             modeC.data(),
+                                                             alignmentRequirementC,
+                                                             typeCompute));
+    /**************************
+     * Set the algorithm to use
+     ***************************/
+
+    hiptensorContractionFind_t find;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT));
+
+    /**********************
+     * Query workspace
+     **********************/
+
+    uint64_t worksize = 0;
+    CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize(
+        handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize));
+
+    void* workspace = nullptr;
+
+    if(worksize > 0)
+    {
+        CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&workspace), worksize));
+    }
+
+    /**************************
+     * Create Contraction Plan
+     **************************/
+    std::cout << "Initializing contraction plan..." << std::endl;
+
+    hiptensorContractionPlan_t plan;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
+
+    std::cout << "Launching contraction kernel..."
+              << std::endl;
+
+    CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
+                                               &plan,
+                                               (void*)&alpha,
+                                               A_d,
+                                               B_d,
+                                               (void*)&beta,
+                                               C_d,
+                                               C_d,
+                                               workspace,
+                                               worksize,
+                                               0 /* stream */));
+
+#ifndef NDEBUG
+    bool printElements = false;
+    bool storeElements = false;
+
+    if(printElements || storeElements)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost));
+    }
+
+    if(printElements)
+    {
+        if(elementsA < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor A elements:\n";
+            hiptensorPrintArrayElements(std::cout, A, elementsA);
+            std::cout << std::endl;
+        }
+
+        if(elementsB < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor B elements:\n";
+            hiptensorPrintArrayElements(std::cout, B, elementsB);
+            std::cout << std::endl;
+        }
+
+        if(elementsC < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor C elements:\n";
+            hiptensorPrintArrayElements(std::cout, C, elementsC);
+            std::cout << std::endl;
+        }
+    }
+
+    if(storeElements)
+    {
+        std::ofstream tensorA, tensorB, tensorC;
+        tensorA.open("tensor_A.txt");
+        hiptensorPrintElementsToFile(tensorA, A, elementsA, ", ");
+        tensorA.close();
+
+        tensorB.open("tensor_B.txt");
+        hiptensorPrintElementsToFile(tensorB, B, elementsB, ", ");
+        tensorB.close();
+
+        tensorC.open("tensor_C_bilinear_contraction_results.txt");
+        hiptensorPrintElementsToFile(tensorC, C, elementsC, ", ");
+        tensorC.close();
+    }
+
+#endif
+
+    CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle));
+
+    HIPTENSOR_FREE_HOST(A);
+    HIPTENSOR_FREE_HOST(B);
+    HIPTENSOR_FREE_HOST(C);
+
+    HIPTENSOR_FREE_DEVICE(A_d);
+    HIPTENSOR_FREE_DEVICE(B_d);
+    HIPTENSOR_FREE_DEVICE(C_d);
+    HIPTENSOR_FREE_DEVICE(workspace);
+
+    std::cout << "Finished!" << std::endl;
+
+    return 0;
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp
index 0a4a9314..f6714a2f 100644
--- a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp
+++ b/samples/01_contraction/simple_bilinear_contraction_bf16.cpp
@@ -23,17 +23,7 @@
 * THE SOFTWARE.
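
For orientation, the shared header above computes C[m,n,u,v] = alpha * sum_{h,k} A[m,n,h,k] * B[u,v,h,k] + beta * C[m,n,u,v]. Below is a minimal host-side sketch of that contraction, assuming the dense row-major packing implied by the NULL strides in the sample; the helper name bilinearReferenceF32 is hypothetical, and this is an illustration only, not the patch's CPU reference implementation.

    // Naive host reference for the 4-mode bilinear contraction used in the
    // samples. Layout assumption: A packed as [M][N][H][K], B as [U][V][H][K],
    // C as [M][N][U][V], all row-major with unit strides.
    #include <cstddef>

    void bilinearReferenceF32(const float* A, const float* B, float* C,
                              float alpha, float beta,
                              size_t M, size_t N, size_t U, size_t V,
                              size_t H, size_t K)
    {
        for(size_t m = 0; m < M; m++)
            for(size_t n = 0; n < N; n++)
                for(size_t u = 0; u < U; u++)
                    for(size_t v = 0; v < V; v++)
                    {
                        float acc = 0.0f;
                        for(size_t h = 0; h < H; h++)
                            for(size_t k = 0; k < K; k++)
                            {
                                // Contract over the shared modes h and k.
                                acc += A[((m * N + n) * H + h) * K + k]
                                       * B[((u * V + v) * H + h) * K + k];
                            }
                        float& c = C[((m * N + n) * U + u) * V + v];
                        c        = alpha * acc + beta * c;
                    }
    }
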
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeC = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeC = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16.cpp index d9d044c9..40708c77 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f16.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f16.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef _Float16 CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeC = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32.cpp index 5704a59d..ee046145 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction_f32.cpp @@ -23,17 +23,7 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_bilinear_contraction.hpp" int main(int argc, char* argv[]) { @@ -51,292 +41,17 @@ int main(int argc, char* argv[]) typedef float CDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; - floatTypeCompute beta = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k} + beta * - *C_{m,n,u,v} - **********************/ - - std::vector modeC{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeC = modeC.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector c_ms_ns_lengths; - for(auto mode : modeC) - { - c_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t c_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &c_ms_ns, - nmodeC, - c_ms_ns_lengths.data(), - NULL, /*stride*/ - typeC, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsC = std::accumulate( - c_ms_ns_lengths.begin(), c_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeC = sizeof(CDataType) * elementsC; - - ADataType* A = nullptr; - BDataType* B = nullptr; - CDataType* C = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&C, sizeC)); - - void *A_d, *B_d, *C_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&C_d), sizeC)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsC; i++) - { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(C_d, static_cast(C), sizeC, hipMemcpyHostToDevice)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementC; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, C_d, &c_ms_ns, &alignmentRequirementC)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "c_ms_ns: " << c_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - &c_ms_ns, - modeC.data(), - alignmentRequirementC, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - 
CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - (void*)&beta, - C_d, - C_d, - workspace, - worksize, - 0 /* stream */)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsC < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor C elements:\n"; - hiptensorPrintArrayElements(std::cout, C, elementsC); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorC; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorC.open("tensor_C_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); - tensorC.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(C); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(C_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); } diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp new file mode 100644 index 00000000..42f60ecb --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef float        ADataType;
+    typedef float        BDataType;
+    typedef float        CDataType;
+    typedef hip_bfloat16 floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_32F;
+    constexpr hipDataType            typeB       = HIP_R_32F;
+    constexpr hipDataType            typeC       = HIP_R_32F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp
new file mode 100644
index 00000000..d39a4fca
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef float    ADataType;
+    typedef float    BDataType;
+    typedef float    CDataType;
+    typedef _Float16 floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_32F;
+    constexpr hipDataType            typeB       = HIP_R_32F;
+    constexpr hipDataType            typeC       = HIP_R_32F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64.cpp
new file mode 100644
index 00000000..412ebbc5
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_f64.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double CDataType;
+    typedef double floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeC       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp
new file mode 100644
index 00000000..673c4768
--- /dev/null
+++ b/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include "simple_bilinear_contraction.hpp"
+
+int main(int argc, char* argv[])
+{
+    /***************************************
+     * Check device support                *
+     **************************************/
+    if(!isF32Supported())
+    {
+        std::cout << "unsupported host device" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    typedef double ADataType;
+    typedef double BDataType;
+    typedef double CDataType;
+    typedef float  floatTypeCompute;
+
+    constexpr hipDataType            typeA       = HIP_R_64F;
+    constexpr hipDataType            typeB       = HIP_R_64F;
+    constexpr hipDataType            typeC       = HIP_R_64F;
+    constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F;
+
+    return bilinearContractionSample<ADataType,
+                                     BDataType,
+                                     CDataType,
+                                     floatTypeCompute,
+                                     typeA,
+                                     typeB,
+                                     typeC,
+                                     typeCompute>();
+}
diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp
new file mode 100644
index 00000000..e9d482c3
--- /dev/null
+++ b/samples/01_contraction/simple_scale_contraction.hpp
@@ -0,0 +1,341 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <unordered_map>
+#include <hiptensor/hiptensor.hpp>
+#include <hiptensor/hiptensor_types.hpp>
+
+#include "common.hpp"
+
+template<typename ADataType,
+         typename BDataType,
+         typename DDataType,
+         typename floatTypeCompute,
+         hipDataType typeA,
+         hipDataType typeB,
+         hipDataType typeD,
+         hiptensorComputeType_t typeCompute>
+int scaleContractionSample()
+{
+    floatTypeCompute alpha = (floatTypeCompute)1.0f;
+    /**********************
+     * Computing: D_{m,n,u,v} = alpha * A_{m,n,h,k} B_{u,v,h,k}
+     **********************/
+
+    std::vector<int32_t> modeD{'m', 'n', 'u', 'v'};
+    std::vector<int32_t> modeA{'m', 'n', 'h', 'k'};
+    std::vector<int32_t> modeB{'u', 'v', 'h', 'k'};
+
+    int nmodeA = modeA.size();
+    int nmodeB = modeB.size();
+    int nmodeD = modeD.size();
+
+    std::unordered_map<int32_t, int64_t> extent;
+
+    extent['m'] = 4;
+    extent['n'] = 3;
+    extent['u'] = 4;
+    extent['v'] = 3;
+    extent['h'] = 6;
+    extent['k'] = 5;
+
+    std::vector<int64_t> d_ms_ns_lengths;
+    for(auto mode : modeD)
+    {
+        d_ms_ns_lengths.push_back(extent[mode]);
+    }
+
+    std::vector<int64_t> a_ms_ks_lengths;
+    for(auto mode : modeA)
+    {
+        a_ms_ks_lengths.push_back(extent[mode]);
+    }
+
+    std::vector<int64_t> b_ns_ks_lengths;
+    for(auto mode : modeB)
+    {
+        b_ns_ks_lengths.push_back(extent[mode]);
+    }
+
+    hiptensorHandle_t* handle;
+    CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle));
+
+    CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE));
+
+    /********************************************
+     * Initialize tensors with the input lengths *
+     ********************************************/
+    hiptensorTensorDescriptor_t a_ms_ks;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                        &a_ms_ks,
+                                                        nmodeA,
+                                                        a_ms_ks_lengths.data(),
+                                                        NULL, /*stride*/
+                                                        typeA,
+                                                        HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t b_ns_ks;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                        &b_ns_ks,
+                                                        nmodeB,
+                                                        b_ns_ks_lengths.data(),
+                                                        NULL, /*stride*/
+                                                        typeB,
+                                                        HIPTENSOR_OP_IDENTITY));
+
+    hiptensorTensorDescriptor_t d_ms_ns;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle,
+                                                        &d_ms_ns,
+                                                        nmodeD,
+                                                        d_ms_ns_lengths.data(),
+                                                        NULL, /*stride*/
+                                                        typeD,
+                                                        HIPTENSOR_OP_IDENTITY));
+
+    /**********************
+     * Allocating data
+     **********************/
+    std::cout << "Initializing host data..."
+              << std::endl;
+
+    size_t elementsA = std::accumulate(
+        a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsB = std::accumulate(
+        b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies<size_t>());
+    size_t elementsD = std::accumulate(
+        d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies<size_t>());
+
+    size_t sizeA = sizeof(ADataType) * elementsA;
+    size_t sizeB = sizeof(BDataType) * elementsB;
+    size_t sizeD = sizeof(DDataType) * elementsD;
+
+    ADataType* A = nullptr;
+    BDataType* B = nullptr;
+    DDataType* D = nullptr;
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB));
+    CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD));
+
+    void *A_d, *B_d, *D_d;
+
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&A_d), sizeA));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&B_d), sizeB));
+    CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&D_d), sizeD));
+
+    /*******************
+     * Initialize data
+     *******************/
+    int initMethod = 0; // TODO read the value from command line
+    for(int64_t i = 0; i < elementsA; i++)
+    {
+        if(initMethod == 0)
+        {
+            A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            A[i] = (ADataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsB; i++)
+    {
+        if(initMethod == 0)
+        {
+            B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100;
+        }
+        else
+        {
+            B[i] = (BDataType)(float(i) / 100);
+        }
+    }
+
+    for(int64_t i = 0; i < elementsD; i++)
+    {
+        D[i] = std::numeric_limits<DDataType>::signaling_NaN();
+    }
+
+    /********************************************
+     * Transfer the Host Tensor to Device Memory *
+     ********************************************/
+    std::cout << "Initializing device data..."
+              << std::endl;
+
+    CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast<const void*>(A), sizeA, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast<const void*>(B), sizeB, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD));
+
+    /************************************************
+     * Retrieve the memory alignment for each tensor
+     ************************************************/
+    uint32_t alignmentRequirementA;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA));
+
+    uint32_t alignmentRequirementB;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB));
+
+    uint32_t alignmentRequirementD;
+    CHECK_HIPTENSOR_ERROR(
+        hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD));
+
+    /*******************************
+     * Create Contraction Descriptor
+     *******************************/
+
+    std::cout << "a_ms_ks: " << a_ms_ks << std::endl;
+    std::cout << "b_ns_ks: " << b_ns_ks << std::endl;
+    std::cout << "d_ms_ns: " << d_ms_ns << std::endl;
+
+    hiptensorContractionDescriptor_t desc;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle,
+                                                             &desc,
+                                                             &a_ms_ks,
+                                                             modeA.data(),
+                                                             alignmentRequirementA,
+                                                             &b_ns_ks,
+                                                             modeB.data(),
+                                                             alignmentRequirementB,
+                                                             nullptr,
+                                                             nullptr,
+                                                             0,
+                                                             &d_ms_ns,
+                                                             modeD.data(),
+                                                             alignmentRequirementD,
+                                                             typeCompute));
+    /**************************
+     * Set the algorithm to use
+     ***************************/
+
+    hiptensorContractionFind_t find;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT));
+
+    /**********************
+     * Query workspace
+     **********************/
+
+    uint64_t worksize = 0;
+    CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize(
+        handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize));
+
+    void* workspace = nullptr;
+
+    if(worksize > 0)
+    {
+        CHECK_HIP_ERROR(hipMalloc(static_cast<void**>(&workspace), worksize));
+    }
+
+    /**************************
+     * Create Contraction Plan
+     **************************/
+    std::cout << "Initializing contraction plan..." << std::endl;
+
+    hiptensorContractionPlan_t plan;
+    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
+
+    std::cout << "Launching contraction kernel..."
+              << std::endl;
+
+    CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
+                                               &plan,
+                                               (void*)&alpha,
+                                               A_d,
+                                               B_d,
+                                               nullptr,
+                                               nullptr,
+                                               D_d,
+                                               workspace,
+                                               worksize,
+                                               0 /* stream */));
+
+#ifndef NDEBUG
+    bool printElements = false;
+    bool storeElements = false;
+
+    if(printElements || storeElements)
+    {
+        CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost));
+    }
+
+    if(printElements)
+    {
+        if(elementsA < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor A elements:\n";
+            hiptensorPrintArrayElements(std::cout, A, elementsA);
+            std::cout << std::endl;
+        }
+
+        if(elementsB < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor B elements:\n";
+            hiptensorPrintArrayElements(std::cout, B, elementsB);
+            std::cout << std::endl;
+        }
+
+        if(elementsD < MAX_ELEMENTS_PRINT_COUNT)
+        {
+            std::cout << "Tensor D elements:\n";
+            hiptensorPrintArrayElements(std::cout, D, elementsD);
+            std::cout << std::endl;
+        }
+    }
+
+    if(storeElements)
+    {
+        std::ofstream tensorA, tensorB, tensorD;
+        tensorA.open("tensor_A.txt");
+        hiptensorPrintElementsToFile(tensorA, A, elementsA, ", ");
+        tensorA.close();
+
+        tensorB.open("tensor_B.txt");
+        hiptensorPrintElementsToFile(tensorB, B, elementsB, ", ");
+        tensorB.close();
+
+        tensorD.open("tensor_D_scale_contraction_results.txt");
+        hiptensorPrintElementsToFile(tensorD, D, elementsD, ", ");
+        tensorD.close();
+    }
+
+#endif
+
+    CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle));
+
+    HIPTENSOR_FREE_HOST(A);
+    HIPTENSOR_FREE_HOST(B);
+    HIPTENSOR_FREE_HOST(D);
+
+    HIPTENSOR_FREE_DEVICE(A_d);
+    HIPTENSOR_FREE_DEVICE(B_d);
+    HIPTENSOR_FREE_DEVICE(D_d);
+    HIPTENSOR_FREE_DEVICE(workspace);
+
+    std::cout << "Finished!" << std::endl;
+
+    return 0;
+}
diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16.cpp
index e05916bf..7b0f8b6c 100644
--- a/samples/01_contraction/simple_scale_contraction_bf16.cpp
+++ b/samples/01_contraction/simple_scale_contraction_bf16.cpp
@@ -23,16 +23,7 @@
 * THE SOFTWARE.
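
The scale path in the header above differs from the bilinear one only in how the C operand is dropped: the descriptor receives nullptr for the C descriptor and modes with alignment 0, and the launch passes nullptr for both beta and the C pointer, so the result is D = alpha * (A contracted with B). Condensed from the code above, with the surrounding declarations (handle, desc, plan, tensors, workspace) assumed:

    // Scale contraction descriptor: no C tensor participates.
    CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(
        handle, &desc,
        &a_ms_ks, modeA.data(), alignmentRequirementA,
        &b_ns_ks, modeB.data(), alignmentRequirementB,
        nullptr, nullptr, 0, // C descriptor, C modes, C alignment: omitted
        &d_ms_ns, modeD.data(), alignmentRequirementD,
        typeCompute));

    // At launch, beta and the C pointer are likewise nullptr.
    CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan,
                                               (void*)&alpha, A_d, B_d,
                                               nullptr, nullptr, D_d,
                                               workspace, worksize,
                                               0 /* stream */));
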
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -44,291 +35,17 @@ int main(int argc, char* argv[]) typedef hip_bfloat16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16BF; - hipDataType typeB = HIP_R_16BF; - hipDataType typeD = HIP_R_16BF; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeD = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16.cpp index 1e62be85..d69193f0 100644 --- a/samples/01_contraction/simple_scale_contraction_f16.cpp +++ b/samples/01_contraction/simple_scale_contraction_f16.cpp @@ -23,312 +23,35 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { /*************************************** * Check device support * **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + typedef _Float16 ADataType; typedef _Float16 BDataType; typedef _Float16 DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_16F; - hipDataType typeB = HIP_R_16F; - hipDataType typeD = HIP_R_16F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32.cpp index c76ec370..e53cc468 100644 --- a/samples/01_contraction/simple_scale_contraction_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction_f32.cpp @@ -23,16 +23,8 @@ * THE SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.hpp" +#include "simple_scale_contraction.hpp" int main(int argc, char* argv[]) { @@ -50,291 +42,17 @@ int main(int argc, char* argv[]) typedef float DDataType; typedef float floatTypeCompute; - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeD = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.0f; - - /********************** - * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} - **********************/ - - std::vector modeD{'m', 'n', 'u', 'v'}; - std::vector modeA{'m', 'n', 'h', 'k'}; - std::vector modeB{'u', 'v', 'h', 'k'}; - - int nmodeA = modeA.size(); - int nmodeB = modeB.size(); - int nmodeD = modeD.size(); - - std::unordered_map extent; - - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; - - std::vector d_ms_ns_lengths; - for(auto mode : modeD) - { - d_ms_ns_lengths.push_back(extent[mode]); - } - - std::vector a_ms_ks_lengths; - for(auto mode : modeA) - { - a_ms_ks_lengths.push_back(extent[mode]); - } - - std::vector b_ns_ks_lengths; - for(auto mode : modeB) - { - b_ns_ks_lengths.push_back(extent[mode]); - } - - hiptensorHandle_t* handle; - CHECK_HIPTENSOR_ERROR(hiptensorCreate(&handle)); - - CHECK_HIPTENSOR_ERROR(hiptensorLoggerSetMask(HIPTENSOR_LOG_LEVEL_PERF_TRACE)); - - /******************************************** - * Initialize tensors with the input lengths * - ********************************************/ - hiptensorTensorDescriptor_t a_ms_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &a_ms_ks, - nmodeA, - a_ms_ks_lengths.data(), - NULL, /*stride*/ - typeA, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t b_ns_ks; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &b_ns_ks, - nmodeB, - b_ns_ks_lengths.data(), - NULL, /*stride*/ - typeB, - HIPTENSOR_OP_IDENTITY)); - - hiptensorTensorDescriptor_t d_ms_ns; - CHECK_HIPTENSOR_ERROR(hiptensorInitTensorDescriptor(handle, - &d_ms_ns, - nmodeD, - d_ms_ns_lengths.data(), - NULL, /*stride*/ - typeD, - HIPTENSOR_OP_IDENTITY)); - - /********************** - * Allocating data - **********************/ - std::cout << "Initializing host data..." 
<< std::endl; - - size_t elementsA = std::accumulate( - a_ms_ks_lengths.begin(), a_ms_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsB = std::accumulate( - b_ns_ks_lengths.begin(), b_ns_ks_lengths.end(), size_t{1}, std::multiplies()); - size_t elementsD = std::accumulate( - d_ms_ns_lengths.begin(), d_ms_ns_lengths.end(), size_t{1}, std::multiplies()); - - size_t sizeA = sizeof(ADataType) * elementsA; - size_t sizeB = sizeof(BDataType) * elementsB; - size_t sizeD = sizeof(DDataType) * elementsD; - - ADataType* A = nullptr; - BDataType* B = nullptr; - DDataType* D = nullptr; - CHECK_HIP_ERROR(hipHostMalloc((void**)&A, sizeA)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&B, sizeB)); - CHECK_HIP_ERROR(hipHostMalloc((void**)&D, sizeD)); - - void *A_d, *B_d, *D_d; - - CHECK_HIP_ERROR(hipMalloc(static_cast(&A_d), sizeA)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&B_d), sizeB)); - CHECK_HIP_ERROR(hipMalloc(static_cast(&D_d), sizeD)); - - /******************* - * Initialize data - *******************/ - for(int64_t i = 0; i < elementsA; i++) - { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsB; i++) - { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; - } - - for(int64_t i = 0; i < elementsD; i++) - { - D[i] = std::numeric_limits::signaling_NaN(); - } - - /******************************************** - * Transfer the Host Tensor to Device Memory * - ********************************************/ - std::cout << "Initializing device data..." << std::endl; - - CHECK_HIP_ERROR(hipMemcpy(A_d, static_cast(A), sizeA, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(B_d, static_cast(B), sizeB, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemset(D_d, 0, sizeD)); - - /************************************************ - * Retrieve the memory alignment for each tensor - ************************************************/ - uint32_t alignmentRequirementA; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); - - uint32_t alignmentRequirementB; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, B_d, &b_ns_ks, &alignmentRequirementB)); - - uint32_t alignmentRequirementD; - CHECK_HIPTENSOR_ERROR( - hiptensorGetAlignmentRequirement(handle, D_d, &d_ms_ns, &alignmentRequirementD)); - - /******************************* - * Create Contraction Descriptor - *******************************/ - - std::cout << "a_ms_ks: " << a_ms_ks << std::endl; - std::cout << "b_ns_ks: " << b_ns_ks << std::endl; - std::cout << "d_ms_ns: " << d_ms_ns << std::endl; - - hiptensorContractionDescriptor_t desc; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionDescriptor(handle, - &desc, - &a_ms_ks, - modeA.data(), - alignmentRequirementA, - &b_ns_ks, - modeB.data(), - alignmentRequirementB, - nullptr, - nullptr, - 0, - &d_ms_ns, - modeD.data(), - alignmentRequirementD, - typeCompute)); - /************************** - * Set the algorithm to use - ***************************/ - - hiptensorContractionFind_t find; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionFind(handle, &find, HIPTENSOR_ALGO_DEFAULT)); - - /********************** - * Query workspace - **********************/ - - uint64_t worksize = 0; - CHECK_HIPTENSOR_ERROR(hiptensorContractionGetWorkspaceSize( - handle, &desc, &find, HIPTENSOR_WORKSPACE_RECOMMENDED, &worksize)); - - void* workspace = nullptr; - - if(worksize > 0) - { - CHECK_HIP_ERROR(hipMalloc(static_cast(&workspace), worksize)); - } - - /************************** - * 
Create Contraction Plan - **************************/ - std::cout << "Initializing contraction plan..." << std::endl; - - hiptensorContractionPlan_t plan; - CHECK_HIPTENSOR_ERROR(hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); - - std::cout << "Launching contraction kernel..." << std::endl; - - CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, - &plan, - (void*)&alpha, - A_d, - B_d, - nullptr, - nullptr, - D_d, - workspace, - worksize, - 0 /* stream */)); - - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - -#if !NDEBUG - bool printElements = false; - bool storeElements = false; - - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - } - - if(printElements) - { - if(elementsA < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor A elements:\n"; - hiptensorPrintArrayElements(std::cout, A, elementsA); - std::cout << std::endl; - } - - if(elementsB < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor B elements:\n"; - hiptensorPrintArrayElements(std::cout, B, elementsB); - std::cout << std::endl; - } - - if(elementsD < MAX_ELEMENTS_PRINT_COUNT) - { - std::cout << "Tensor D elements:\n"; - hiptensorPrintArrayElements(std::cout, D, elementsD); - std::cout << std::endl; - } - } - - if(storeElements) - { - std::ofstream tensorA, tensorB, tensorD; - tensorA.open("tensor_A.txt"); - hiptensorPrintElementsToFile(tensorA, A, elementsA, ", "); - tensorA.close(); - - tensorB.open("tensor_B.txt"); - hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); - tensorB.close(); - - tensorD.open("tensor_D_scale_contraction_results.txt"); - hiptensorPrintElementsToFile(tensorD, D, elementsD, ", "); - tensorD.close(); - } - -#endif - - CHECK_HIPTENSOR_ERROR(hiptensorDestroy(handle)); - - HIPTENSOR_FREE_HOST(A); - HIPTENSOR_FREE_HOST(B); - HIPTENSOR_FREE_HOST(D); - - HIPTENSOR_FREE_DEVICE(A_d); - HIPTENSOR_FREE_DEVICE(B_d); - HIPTENSOR_FREE_DEVICE(D_d); - HIPTENSOR_FREE_DEVICE(workspace); - - std::cout << "Finished!" << std::endl; - - return 0; + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); } diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp new file mode 100644 index 00000000..c11b8ded --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp new file mode 100644 index 00000000..377ee707 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64.cpp new file mode 100644 index 00000000..5eb94c15 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return scaleContractionSample(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp new file mode 100644 index 00000000..fdec48ab --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample(); +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index a08065a0..08ddf0b2 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -3,8 +3,11 @@ Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index b28e9a88..08ddf0b2 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -4,7 +4,10 @@ Tensor Data Types: - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, 
NONE_TYPE, HIP_R_64F, HIP_R_32F ]
 Algorithm Types:
   - HIPTENSOR_ALGO_DEFAULT
   - HIPTENSOR_ALGO_DEFAULT_PATIENT
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp
index 9446157f..ce67278f 100644
--- a/test/01_contraction/contraction_test.cpp
+++ b/test/01_contraction/contraction_test.cpp
@@ -125,6 +125,9 @@ namespace hiptensor
                     || (CDataType == NONE_TYPE));
         EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF)
                     || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F));
+        EXPECT_TRUE(
+            (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF)
+            || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F));
 
         mRunFlag &= checkDevice(DDataType);
 
@@ -488,7 +491,11 @@ namespace hiptensor
             auto CDataType = testType[2];
             auto DDataType = testType[3];
 
-            auto computeType = convertToComputeType(testType[4]);
+            auto computeType = convertToComputeType(testType[4]);
+            double alphaBuf = 0.;
+            double betaBuf = 0.;
+            writeVal(&alphaBuf, computeType, alpha);
+            writeVal(&betaBuf, computeType, beta);
 
             CHECK_HIPTENSOR_ERROR(
                 hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize));
@@ -497,20 +504,21 @@ namespace hiptensor
 
             CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle,
                                                        &plan,
-                                                       (void*)&alpha,
+                                                       (void*)&alphaBuf,
                                                        resource->deviceA().get(),
                                                        resource->deviceB().get(),
-                                                       (void*)&beta,
+                                                       (void*)&betaBuf,
                                                        resource->deviceC().get(),
                                                        resource->deviceD().get(),
                                                        workspace,
                                                        worksize,
                                                        0 /* stream */));
 
-            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha,
+            CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan,
+                                                                (void*)&alphaBuf,
                                                                 resource->hostA().get(),
                                                                 resource->hostB().get(),
-                                                                (void*)&beta,
+                                                                (void*)&betaBuf,
                                                                 resource->hostC().get(),
                                                                 resource->hostD().get(),
                                                                 a_ms_ks.mLengths,

From ab8d557e0e68d29c5d3b17020c5c43ef898ede8f Mon Sep 17 00:00:00 2001
From: Cong Ma
Date: Thu, 30 Nov 2023 18:46:46 +0000
Subject: [PATCH 03/12] Add placeholder for solution unique_id

The Actor Critic solution unique_ids are not ready yet, so we add
placeholders in the new Actor Critic so that the unit tests can pass.
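
For reference, the selection shape being stubbed is just a map lookup on a
hard-coded id. Below is a minimal compilable sketch, assuming a candidates
map keyed by unique_id as in contraction_selection.cpp; ContractionSolution
and findPlaceholderSolution are illustrative stand-ins, not the hipTensor
types or API:

#include <cstddef>
#include <iostream>
#include <memory>
#include <unordered_map>

// Stand-in for the real solution type in contraction_solution.hpp.
struct ContractionSolution
{
    std::size_t id;
};

// Placeholder selection: look up a hard-coded unique_id in the candidate
// map; selection fails cleanly if no instance is registered under that id.
ContractionSolution* findPlaceholderSolution(
    std::unordered_map<std::size_t, std::unique_ptr<ContractionSolution>> const& candidates,
    std::size_t placeholderId)
{
    if(auto candidate = candidates.find(placeholderId); candidate != candidates.end())
    {
        return candidate->second.get();
    }
    return nullptr; // callers translate this into a hiptensor error status
}

int main()
{
    std::unordered_map<std::size_t, std::unique_ptr<ContractionSolution>> candidates;
    candidates[7255639152084218514ull]
        = std::make_unique<ContractionSolution>(ContractionSolution{7255639152084218514ull});

    // Same lookup shape as the TODO blocks in contraction_selection.cpp.
    if(auto* solution = findPlaceholderSolution(candidates, 7255639152084218514ull))
    {
        std::cout << "selected placeholder solution " << solution->id << "\n";
    }
    return 0;
}

Once the Actor Critic model is trained, the hard-coded ids above are the
only lines that need to change.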
--- .../src/contraction/contraction_selection.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 888ef4c1..68c748b0 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -72,8 +72,8 @@ namespace hiptensor * hipDataTypeSize(typeE); void * A_d, *B_d, *D_d, *E_d, *wspace; - double alpha = 0.0d; - double beta = 0.0d; + double alpha = 0.0; + double beta = 0.0; writeVal(&alpha, computeType, 1.02); writeVal(&beta, computeType, 1.03); @@ -188,7 +188,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -237,7 +237,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 7255639152084218514; + unique_id = 7255639152084218514ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -286,7 +286,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -335,7 +335,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id - unique_id = 8689089455041651212; + unique_id = 8689089455041651212ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -379,6 +379,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 1078559130597702989ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -421,6 +422,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 6506383527825239632ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -463,6 +465,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 14486135440731032454ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -510,6 +513,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11931735240548010466ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1276,6 +1280,7 @@ namespace hiptensor size_t unique_id = 0; // TODO select unique_id + unique_id = 11912251726020349830ull; if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) { @@ -1317,6 +1322,7 @@ namespace hiptensor int d6 = a_ms_ks_lengths[3]; size_t unique_id = 0; + unique_id = 15375432626310194825ull; // TODO select unique_id if(auto candidate = candidates.find(unique_id); candidate != candidates.end()) From df27e326d15a65118a657b04c63eef37ecde946e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:12:44 +0000 Subject: [PATCH 04/12] Update contraction device instances Update contraction device instances since CK has updated them. 
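
The mechanical change in each *_instance.cpp below is the same: the
hand-written std::tuple of DeviceContractionMultipleD_Xdl_CShuffle
configurations is replaced by a shared alias pulled in from CK's
device_contraction_instance.hpp. A minimal sketch of that refactor pattern
follows; DeviceOp, its parameter list, and the three sample configurations
are all invented for illustration, and CK's real
device_contraction_kk_instance has a different, much larger signature:

#include <tuple>
#include <type_traits>

// Invented stand-in for a heavily parameterized device op; the real
// DeviceContractionMultipleD_Xdl_CShuffle takes dozens of template arguments.
template <typename AData, typename BData, typename EData, int BlockSize, int MPerBlock>
struct DeviceOp
{
};

// Before: every instance file spelled out the full tuning table.
using f32_instances_verbose = std::tuple<DeviceOp<float, float, float, 256, 256>,
                                         DeviceOp<float, float, float, 256, 128>,
                                         DeviceOp<float, float, float, 128, 128>>;

// After: one shared alias (in the spirit of device_contraction_kk_instance)
// owns the tuning table, and each instance file supplies only the data types.
template <typename AData, typename BData, typename EData>
using device_contraction_kk_instance = std::tuple<DeviceOp<AData, BData, EData, 256, 256>,
                                                  DeviceOp<AData, BData, EData, 256, 128>,
                                                  DeviceOp<AData, BData, EData, 128, 128>>;

using f32_instances_shared = device_contraction_kk_instance<float, float, float>;

// The refactor is only valid if the generated instance set is unchanged.
static_assert(std::is_same_v<f32_instances_verbose, f32_instances_shared>,
              "shared alias must reproduce the hand-written tuple");

int main()
{
    return 0;
}

This is why the diffs below mostly delete configuration rows: the tuning
tables now live in one place upstream instead of being copied per type.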
--- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 62 ++++++----------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 65 ++++++------------ ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 65 ++++++------------ ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 59 ++++++---------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 59 ++++++---------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 65 +++++++----------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 68 +++++++------------ ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 68 +++++++------------ ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 27 +++++++- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 27 +++++++- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 58 ++++++---------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 58 ++++++---------- 20 files changed, 460 insertions(+), 642 deletions(-) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
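Editor's sketch (not part of the patch): the `A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]` comment each hunk keeps is the whole contract of these kernels. Written out as naive host-side reference loops (all extents, layouts, and names below are hypothetical, with row-major packed storage assumed), the bilinear m2_n2_k2 contraction is:

#include <vector>

void reference_bilinear_contraction(const std::vector<float>& A, // [M0][M1][K0][K1]
                                    const std::vector<float>& B, // [N0][N1][K0][K1]
                                    const std::vector<float>& D, // [M0][M1][N0][N1]
                                    std::vector<float>&       E, // [M0][M1][N0][N1]
                                    int M0, int M1, int N0, int N1, int K0, int K1,
                                    float alpha, float beta)
{
    // Packed row-major accessors for the two inputs.
    auto a = [&](int m0, int m1, int k0, int k1) {
        return A[((m0 * M1 + m1) * K0 + k0) * K1 + k1];
    };
    auto b = [&](int n0, int n1, int k0, int k1) {
        return B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
    };
    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
                for(int n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f;
                    for(int k0 = 0; k0 < K0; ++k0)
                        for(int k1 = 0; k1 < K1; ++k1)
                            acc += a(m0, m1, k0, k1) * b(n0, n1, k0, k1);
                    const int e = ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
                    E[e] = alpha * acc + beta * D[e]; // scale kernels: no D, E = alpha * acc
                }
}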
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,45 +46,19 @@ namespace ck
         namespace instance
        {
-            using F32       = float;
-            using F32_Tuple = ck::Tuple<F32>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // m/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
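Editor's sketch (not part of the patch): the two CDE elementwise operations named throughout these hunks behave as epilogues on the GEMM accumulator. Assuming the usual CK semantics for ck::tensor_operation::element_wise::{Bilinear, Scale} (the structs below are illustrative stand-ins, not CK's declarations), Bilinear fuses the D tensor into the epilogue while Scale has no D operand:

struct BilinearSketch
{
    float alpha, beta;
    // e = alpha * accumulator + beta * d
    void operator()(float& e, const float& acc, const float& d) const
    {
        e = alpha * acc + beta * d;
    }
};

struct ScaleSketch
{
    float alpha;
    // e = alpha * accumulator
    void operator()(float& e, const float& acc) const { e = alpha * acc; }
};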
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // k/k/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
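Editor's sketch (not part of the patch): each of these files ends with the same registration idiom, `add_device_operation_instances(instances, ...)`, which appends one default-constructed device op per tuple element to the caller's vector. A simplified stand-in for CK's helper from add_device_operation_instance.hpp is shown below; the real helper's exact signature is not visible in this patch.

#include <memory>
#include <tuple>
#include <vector>

// Default-construct each op type in the tuple and hand ownership to the vector.
template <typename BaseOp, typename... Ops>
void add_device_operation_instances_sketch(std::vector<std::unique_ptr<BaseOp>>& op_ptrs,
                                           const std::tuple<Ops...>&)
{
    (op_ptrs.push_back(std::make_unique<Ops>()), ...);
}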
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // k/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // m/k/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,39 +46,19 @@ namespace ck
         namespace instance
         {
-            using F64       = double;
-            using F64_Tuple = ck::Tuple<F64>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
             // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
             // m/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>,
-
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 88345e74..24d2d570 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
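Editor's sketch (not part of the patch): the kkn/knn/mkn (and four-letter kknn/knnn/...) file suffixes encode the fastest-changing index of A/B/(D/)E, as the in-file comments state. For the m2_n2_k2 case, with hypothetical extents, the two A layouts differ only in which index has unit stride:

// Hypothetical extents for illustration only.
constexpr int M1 = 4, K0 = 8, K1 = 2;
// "k"-suffix A: A[m0][m1][k0][k1] stored with k1 fastest (as in kknn files).
constexpr int strideA_kmajor[4] = {M1 * K0 * K1, K0 * K1, K1, 1};
// "m"-suffix A: the same logical tensor stored with m1 fastest (as in mknn files).
constexpr int strideA_mmajor[4] = {K0 * K1 * M1, 1, K1 * M1, M1};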
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,42 +46,19 @@ namespace ck
         namespace instance
         {
-            using F32         = float;
-            using Empty_Tuple = ck::Tuple<>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Scale       = ck::tensor_operation::element_wise::Scale;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1]
-            // k/k/n are the fast changing dimension for A/B/E
+            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+            // k/k/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>
-                // clang-format on
-                >;
+                = device_contraction_kk_instance;
 
             void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance(
                 std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                                        2,
                                                                        2,
                                                                        F32,
                                                                        F32,
                                                                        Empty_Tuple,
                                                                        F32,
                                                                        PassThrough,
                                                                        PassThrough,
-                                                                       Scale>>>&
-                    instances)
+                                                                       Scale,
+                                                                       F32>>>& instances)
            {
                 add_device_operation_instances(
                     instances,
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
index 38702afd..f559dc06 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter
-// of each device op
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
 
-#include "common.hpp"
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
 namespace ck
 {
@@ -41,45 +46,19 @@ namespace ck
         namespace instance
         {
-            using F32         = float;
-            using Empty_Tuple = ck::Tuple<>;
-
-            template <index_t... Is>
-            using S = ck::Sequence<Is...>;
-
-            using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-            using Scale       = ck::tensor_operation::element_wise::Scale;
-
-            static constexpr auto GemmMNKPadding
-                = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1]
-            // k/n/n are the fast changing dimension for A/B/E
+            // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+            // k/n/n/n are the fast changing dimension for A/B/D/E
             using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance
-                = std::tuple<
-                // clang-format off
-            //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
-            //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
-            //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
-            //#####################################| | | | | | | | | | | | | | | | | | | |
| | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 
128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>,
-            DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>
-                // clang-format on
-                >;
+                = device_contraction_kn_instance;
 
             void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance(
                 std::vector<std::unique_ptr<DeviceContractionMultipleD<2,
                                                                        2,
                                                                        2,
                                                                        F32,
                                                                        F32,
                                                                        Empty_Tuple,
                                                                        F32,
                                                                        PassThrough,
                                                                        PassThrough,
-                                                                       Scale>>>&
-                    instances)
+                                                                       Scale,
+                                                                       F32>>>& instances)
             {
                 add_device_operation_instances(
                     instances,
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
index 735a5e34..a522052d 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter
-// of each device op
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
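Editor's sketch (not part of the patch): the "+ Scale," / "+ F32>>>&" and "+ Bilinear," / "+ F32>>>&" (or F64) additions in every registration signature point at one interface change: DeviceContractionMultipleD grows a trailing compute-type parameter. The declaration below is illustrative only; parameter names and ordering are assumptions, not CK's actual header.

namespace sketch
{
    template <int NumDimM, int NumDimN, int NumDimK,
              typename ADataType, typename BDataType,
              typename DsDataType, typename EDataType,
              typename AElementwiseOp, typename BElementwiseOp,
              typename CDEElementwiseOp,
              typename ComputeDataType = EDataType> // the new trailing argument
    struct DeviceContractionMultipleD; // declaration sketch only
} // namespace sketch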
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index 04176d80..dac46620 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp index 06481fc7..0830b49f 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! 
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp index 94922008..9a716ba3 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp index e70b854b..e02ac144 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -1,5 +1,28 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ // This (ifndef) is a hack to use customized behavior for buffer load rather than using default // setting Don't use this hack unless absolutely necessary! diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp index f8904a8f..6f168ee2 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, 
F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 
2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, From f85df837f3ae885178b197f8c2435c14e9847a2c Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Mon, 4 Dec 2023 16:29:00 +0000 Subject: [PATCH 05/12] Print C in sample output 1. Initialize the data with 0.01, 0.02, ... by default 2. Print C
--- .../simple_bilinear_contraction.hpp | 22 ++++++++++++++----- .../simple_scale_contraction.hpp | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/samples/01_contraction/simple_bilinear_contraction.hpp b/samples/01_contraction/simple_bilinear_contraction.hpp index aaef4a1b..27001232 100644 --- a/samples/01_contraction/simple_bilinear_contraction.hpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -154,7 +154,7 @@ int bilinearContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read value from commandline + int initMethod = 1; // TODO read value from commandline for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) @@ -287,11 +287,6 @@ int bilinearContractionSample() bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -314,6 +309,15 @@ int bilinearContractionSample() hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -327,6 +331,12 @@ hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_scale_contraction.hpp b/samples/01_contraction/simple_scale_contraction.hpp index e9d482c3..78b026b6 100644 --- a/samples/01_contraction/simple_scale_contraction.hpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -151,7 +151,7 @@ int scaleContractionSample() /******************* * Initialize data *******************/ - int initMethod = 0; // TODO read the value from command line + int initMethod = 1; // TODO read the value from command line for(int64_t i = 0; i < elementsA; i++) { if(initMethod == 0) From 5c45a8c80dd0e90171a791bd945c0e41b84ef22d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 18:15:54 +0000 Subject: [PATCH 06/12] Set CK contraction instance to run only once When the logger level is set to HIPTENSOR_LOG_LEVEL_PERF_TRACE, we make CK instances measure the running time. The problem is that CK internally will run the contraction 10 times by default. This leads to an issue: it returns a wrong result for C = alpha A x B + beta C, because every extra run accumulates into C again. Setting StreamConfig.nrepeat_ = 1 makes the contraction run only once. --- library/src/contraction/hiptensor_contraction.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index c7b7501b..8148eeaa 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -720,7 +720,13 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); if(time < 0) { return HIPTENSOR_STATUS_CK_ERROR;
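To illustrate why nrepeat_ must be 1 here: an in-place bilinear update is not idempotent, so a timing loop that re-launches the kernel over the same buffers changes the answer. A minimal standalone sketch of the effect, with made-up scalar values standing in for tensor elements (not hiptensor code):

    // Minimal sketch (hypothetical values): repeating the in-place update
    // C = alpha * (A x B) + beta * C accumulates into C again on every pass,
    // which is why the timing loop above needs nrepeat_ = 1.
    #include <iostream>

    int main()
    {
        float alpha = 1.0f, beta = 1.0f;
        float ab = 2.0f; // stands in for one element of A x B
        float c  = 3.0f; // output element that is also read back as input

        for(int repeat = 0; repeat < 2; ++repeat)
        {
            c = alpha * ab + beta * c; // 5 after the first pass, 7 after the second
        }
        std::cout << c << std::endl; // prints 7; a single run gives the expected 5
        return 0;
    }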
From f631818937db143e42d444d4a0c2ce5646ad525e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Tue, 5 Dec 2023 23:57:33 +0000 Subject: [PATCH 07/12] Fixed a bug in the CPU reference 1. ck::bhalf_t cannot be cast to float or double by static_cast. Use ck::type_convert() to fix it. 2. epsilon() is not a good value to measure the relative difference of data. It is too small for double (eps < 10e-13). --- .../contraction_cpu_reference_impl.hpp | 17 +++++++---------- .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 ++++----- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index ac4fc20d..a9a9d176 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -53,7 +53,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -152,7 +151,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - auto accum = static_cast(0); + float accum = 0.0f; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -174,8 +173,7 @@ arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum - += static_cast(valA) * static_cast(valB); + accum += ck::type_convert(valA) * ck::type_convert(valB); } } @@ -184,15 +182,17 @@ if constexpr(std::is_same_v) { - arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum)); } else // bilinear { // NumDTensor will be 1 due to SFINAE of this class auto indexD = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]); - arg.mOpCDE( - ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]); + arg.mOpCDE(((EDataType*)arg.mE)[indexE], + ck::type_convert(accum), + ((EDataType*)(arg.mD[0]))[indexD]); } }; @@ -323,7 +323,6 @@ namespace hiptensor typename BDataType, typename DsDataType, typename EDataType, - typename AccumDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, @@ -335,7 +334,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - AccumDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, @@ -375,7 +373,6 @@ namespace hiptensor BDataType, DsDataType, EDataType, - EDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..05daf544 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template std::pair compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits::signaling_NaN(); } - else if(max_relative_error > (eps * tolerance)) + else if(max_relative_error > tolerance) { retval = false; } @@ -214,7 +214,7 @@ template std::pair compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,13 +276,12 @@ std::pair compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast(static_cast(val)); }; - auto eps = toDouble(std::numeric_limits::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits::signaling_NaN(); } - else if(maxRelativeError > (eps * tolerance)) + else if(maxRelativeError > tolerance) { retval = false; }
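Some background on the first fix: ck::bhalf_t is an unsigned short holding raw bfloat16 bits, so static_cast<float> converts the integer bit pattern rather than the encoded number. A conversion has to reinterpret the bits. The sketch below shows the principle only; it is not CK's ck::type_convert implementation:

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // bfloat16 keeps the top 16 bits of an IEEE-754 binary32 value, so
    // shifting the bits into the high half of a 32-bit word recovers it.
    float bf16BitsToFloat(uint16_t bits)
    {
        uint32_t wide = static_cast<uint32_t>(bits) << 16;
        float    result;
        std::memcpy(&result, &wide, sizeof(result));
        return result;
    }

    int main()
    {
        uint16_t one = 0x3F80; // bfloat16 bit pattern of 1.0f
        std::cout << bf16BitsToFloat(one) << std::endl;    // 1 (decoded value)
        std::cout << static_cast<float>(one) << std::endl; // 16256 (the raw integer)
        return 0;
    }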
AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 08ddf0b2..eee5d7f1 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -11,7 +11,7 @@ Tensor Data Types: Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: diff --git a/test/utils.hpp b/test/utils.hpp index ad4bb565..05daf544 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template <typename DDataType> std::pair<bool, double> compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits<double>::signaling_NaN(); } - else if(max_relative_error > (eps * tolerance)) + else if(max_relative_error > tolerance) { retval = false; } @@ -214,7 +214,7 @@ template <typename DDataType> std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 100.0) + double tolerance = 0.005) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,13 +276,12 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); }; - auto eps = toDouble(std::numeric_limits<DDataType>::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits<double>::signaling_NaN(); } - else if(maxRelativeError > (eps * tolerance)) + else if(maxRelativeError > tolerance) { retval = false; } From e5cefe79a7e4630b4e1f07edd425a6cba6fda519 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 01:43:48 +0000 Subject: [PATCH 08/12] Add comments --- library/src/contraction/contraction_meta_traits.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp index 6a7cb35f..e66ac432 100644 --- a/library/src/contraction/contraction_meta_traits.hpp +++ b/library/src/contraction/contraction_meta_traits.hpp @@ -67,6 +67,14 @@ namespace hiptensor constexpr static ck::index_t DimsM = NumDimsM; constexpr static ck::index_t DimsN = NumDimsN; constexpr static ck::index_t DimsK = NumDimsK; + /* + * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) as its bfloat16 type. + * What we want here is to use ck::bhalf_t with CK instances and hip_bfloat16 + * with hiptensor classes. + * + * When creating a solution, ck::bhalf_t is passed in to create the CK instance. + * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
+ */ using ADataT = std::conditional_t<std::is_same_v<ADataType, ck::bhalf_t>, hip_bfloat16, ADataType>; using BDataT From 4345a1c5b4b32fa427a8880a944895b3947ee6dd Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 17:14:53 +0000 Subject: [PATCH 09/12] Rename contraction sample files The pattern of the contraction sample file names is - bilinear: simple_bilinear_contraction_<typeA>_<typeB>_<typeC>_<typeD>_compute_<computeType>.cpp - scale : simple_scale_contraction_<typeA>_<typeB>_<typeC>_compute_<computeType>.cpp --- samples/01_contraction/CMakeLists.txt | 85 ++++++++++--------- ...tion_bf16_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...ntraction_f16_f16_f16_f16_compute_f16.cpp} | 0 ...traction_f32_f32_f32_f32_compute_bf16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f16.cpp} | 0 ...ntraction_f32_f32_f32_f32_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f32.cpp} | 0 ...ntraction_f64_f64_f64_f64_compute_f64.cpp} | 0 ...ntraction_bf16_bf16_bf16_compute_bf16.cpp} | 0 ...e_contraction_f16_f16_f16_compute_f16.cpp} | 0 ..._contraction_f32_f32_f32_compute_bf16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f16.cpp} | 0 ...e_contraction_f32_f32_f32_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f32.cpp} | 0 ...e_contraction_f64_f64_f64_compute_f64.cpp} | 0 15 files changed, 43 insertions(+), 42 deletions(-) rename samples/01_contraction/{simple_bilinear_contraction_bf16.cpp => simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f16.cpp => simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_bf16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32_f16.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f32.cpp => simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64_f32.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_bilinear_contraction_f64.cpp => simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_bf16.cpp => simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f16.cpp => simple_scale_contraction_f16_f16_f16_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_bf16.cpp => simple_scale_contraction_f32_f32_f32_compute_bf16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32_f16.cpp => simple_scale_contraction_f32_f32_f32_compute_f16.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f32.cpp => simple_scale_contraction_f32_f32_f32_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64_f32.cpp => simple_scale_contraction_f64_f64_f64_compute_f32.cpp} (100%) rename samples/01_contraction/{simple_scale_contraction_f64.cpp => simple_scale_contraction_f64_f64_f64_compute_f64.cpp} (100%) diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index de834d72..00393f1d 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,62 +26,63 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - add_hiptensor_sample(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) -
add_hiptensor_sample(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() - add_executable(simple_contraction_scale_f16 simple_scale_contraction_f16.cpp) - target_link_libraries(simple_contraction_scale_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + 
target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_bf16 simple_scale_contraction_bf16.cpp) - target_link_libraries(simple_contraction_scale_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_bf16 simple_scale_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_scale_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32_f16 simple_scale_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_scale_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64 simple_scale_contraction_f64.cpp) - target_link_libraries(simple_contraction_scale_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f64_f32 simple_scale_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_scale_f64_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f16 simple_bilinear_contraction_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_bf16 simple_bilinear_contraction_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE 
hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_bf16 simple_bilinear_contraction_f32_bf16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_bf16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32_f16 simple_bilinear_contraction_f32_f16.cpp) - target_link_libraries(simple_contraction_bilinear_f32_f16 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64 simple_bilinear_contraction_f64.cpp) - target_link_libraries(simple_contraction_bilinear_f64 PRIVATE hiptensor::hiptensor) + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f64_f32 simple_bilinear_contraction_f64_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f64_f32 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32_f16.cpp rename to samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to 
samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_bilinear_contraction_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_bilinear_contraction_f64.cpp rename to samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp diff --git a/samples/01_contraction/simple_scale_contraction_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_bf16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32_f16.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64_f32.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp diff --git a/samples/01_contraction/simple_scale_contraction_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp similarity index 100% rename from samples/01_contraction/simple_scale_contraction_f64.cpp rename to samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp From 43f33ee5c6b40d0b4278cd1c221399eb99b16a7d Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:02:53 +0000 Subject: [PATCH 10/12] Improve CPU reference accuracy The relative difference between the contraction result and the CPU reference is less than 0.1% after the improvement.
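The improvement is to round each partial product through the compute type before accumulating in float, mirroring the rounding the device kernel performs (see the hunk below). A minimal standalone sketch of that rounding model, using _Float16 as a stand-in compute type (illustrative only, not hiptensor code; _Float16 requires a compiler and target that support it, e.g. hipcc/clang):

```
#include <cstdio>

// Accumulate a dot product in float, but round each partial product
// through a narrower compute type first, as the device kernel would.
template <typename ComputeT>
float accumulateWithComputeRounding(const float* a, const float* b, int n)
{
    float accum = 0.0f;
    for(int i = 0; i < n; i++)
    {
        // Multiply in float, round to ComputeT, then widen back to float.
        accum += static_cast<float>(static_cast<ComputeT>(a[i] * b[i]));
    }
    return accum;
}

int main()
{
    float a[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    float b[4] = {0.5f, 0.6f, 0.7f, 0.8f};
    // With an f16 compute type each product is quantized before the sum,
    // which is what pulls the CPU reference toward the device result.
    std::printf("f16-rounded: %f\n", accumulateWithComputeRounding<_Float16>(a, b, 4));
    std::printf("f32:         %f\n", accumulateWithComputeRounding<float>(a, b, 4));
    return 0;
}
```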
--- library/src/contraction/contraction_cpu_reference_impl.hpp | 3 ++- test/utils.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index a9a9d176..d21df2d3 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -173,7 +173,8 @@ namespace hiptensor arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); // Mult / accum - accum += ck::type_convert<float>(valA) * ck::type_convert<float>(valB); + accum += ck::type_convert<float>(ck::type_convert<ComputeDataType>( + ck::type_convert<float>(valA) * ck::type_convert<float>(valB))); } } diff --git a/test/utils.hpp b/test/utils.hpp index 05daf544..f39f0fb5 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template <typename DDataType> std::pair<bool, double> compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { bool retval = true; double max_relative_error = 0.0; @@ -214,7 +214,7 @@ template <typename DDataType> std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.005) + double tolerance = 0.001) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); From fec9065460d2205f9b9478ccd5f69fa51d2a839e Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Wed, 6 Dec 2023 21:19:36 +0000 Subject: [PATCH 11/12] Add comments to explain how to pass the alpha value --- library/src/contraction/contraction_selection.cpp | 11 ++++++++++- test/01_contraction/contraction_test.cpp | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp index 68c748b0..9b0cdf9f 100644 --- a/library/src/contraction/contraction_selection.cpp +++ b/library/src/contraction/contraction_selection.cpp @@ -71,7 +71,16 @@ namespace hiptensor auto sizeE = elementSpaceFromLengthsAndStrides(e_ms_ns_lengths, e_ms_ns_strides) * hipDataTypeSize(typeE); - void * A_d, *B_d, *D_d, *E_d, *wspace; + void *A_d, *B_d, *D_d, *E_d, *wspace; + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a value of the compute type. + */ double alpha = 0.0; double beta = 0.0; writeVal(&alpha, computeType, 1.02); diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index ce67278f..76cc3033 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -491,9 +491,18 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - auto computeType = convertToComputeType(testType[4]); - double alphaBuf = 0.; - double betaBuf = 0.; + auto computeType = convertToComputeType(testType[4]); + + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha.
+ * ``` + * alphaF = hiptensor::readVal( + * alpha, convertToComputeType(HipDataType_v)); + * ``` + * Hence, `alpha` and `beta` need to point to a value of the compute type. + */ + double alphaBuf = 0.; + double betaBuf = 0.; writeVal(&alphaBuf, computeType, alpha); writeVal(&betaBuf, computeType, beta); From b21fe0b18881fb6ed5643be7bd2e242f9a4b45a2 Mon Sep 17 00:00:00 2001 From: Cong Ma Date: Thu, 7 Dec 2023 02:16:22 +0000 Subject: [PATCH 12/12] Update CPU reference 1. Revert the default threshold of the relative difference to (100 * std::numeric_limits<DDataType>::epsilon()). 2. Update the CPU reference so that the difference between the CPU reference and the output of the contraction instance is less than (100 * std::numeric_limits<DDataType>::epsilon()). --- .../contraction_cpu_reference_impl.hpp | 29 ++++++++++++++----- .../contraction_cpu_reference_instances.cpp | 14 +++++++++ .../configs/bilinear_test_params.yaml | 2 +- .../configs/scale_test_params.yaml | 2 +- test/utils.hpp | 9 +++--- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp index d21df2d3..2e3d0cbe 100644 --- a/library/src/contraction/contraction_cpu_reference_impl.hpp +++ b/library/src/contraction/contraction_cpu_reference_impl.hpp @@ -45,19 +45,25 @@ namespace hiptensor { // hardcoded for NumDimM == NumDimN == NumDimK == 2 + // + // ck::bhalf_t is ushort, so we cannot perform bhalf_t * bhalf_t. + // CK does not use ck::bhalf_t as AccDataType, but we still + // add this guard here. template < ck::index_t NumDimM, ck::index_t NumDimN, ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputeDataType = ADataType, - ck::enable_if_t, bool> = false> struct ReferenceContraction_M2_N2_K2 @@ -151,7 +157,7 @@ namespace hiptensor }; auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { - float accum = 0.0f; + AccDataType accum = 0; auto K0 = arg.mA_ms_ks_lengths[2]; auto K1 = arg.mA_ms_ks_lengths[3]; @@ -165,16 +171,19 @@ namespace hiptensor auto indexB = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides); - ADataType valA; - BDataType valB; + AccDataType valA; + AccDataType valB; // Element-wise ops - arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]); - arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]); + arg.mOpA( + valA, + ck::type_convert<AccDataType>(((ADataType*)arg.mA)[indexA])); + arg.mOpB( + valB, + ck::type_convert<AccDataType>(((BDataType*)arg.mB)[indexB])); // Mult / accum - accum += ck::type_convert<float>(ck::type_convert<ComputeDataType>( - ck::type_convert<float>(valA) * ck::type_convert<float>(valB))); + accum += valA * valB; } } @@ -322,6 +331,7 @@ namespace hiptensor ck::index_t NumDimsK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -333,6 +343,7 @@ namespace hiptensor NumDimsK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, @@ -359,6 +370,7 @@ namespace hiptensor ck::index_t NumDimK, typename ADataType, typename BDataType, + typename AccDataType, typename DsDataType, typename EDataType, typename AElementwiseOperation, @@ -372,6 +384,7 @@ namespace hiptensor NumDimK, ADataType, BDataType, + AccDataType, DsDataType, EDataType, AElementwiseOperation, diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 173a49e9..31fb0191 100644 --- a/library/src/contraction/contraction_cpu_reference_instances.cpp +++ b/library/src/contraction/contraction_cpu_reference_instances.cpp @@ -39,6 +39,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple<ck::half_t>, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -53,6 +54,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple<ck::bhalf_t>, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -67,6 +69,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<float>, float, ck::tensor_operation::element_wise::PassThrough, @@ -80,6 +83,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<float>, float, ck::tensor_operation::element_wise::PassThrough, @@ -93,6 +97,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<float>, float, ck::tensor_operation::element_wise::PassThrough, @@ -107,6 +112,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple<double>, double, ck::tensor_operation::element_wise::PassThrough, @@ -120,6 +126,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple<double>, double, ck::tensor_operation::element_wise::PassThrough, @@ -134,6 +141,7 @@ namespace hiptensor 2, ck::half_t, ck::half_t, + float, ck::Tuple<>, ck::half_t, ck::tensor_operation::element_wise::PassThrough, @@ -148,6 +156,7 @@ namespace hiptensor 2, ck::bhalf_t, ck::bhalf_t, + float, ck::Tuple<>, ck::bhalf_t, ck::tensor_operation::element_wise::PassThrough, @@ -162,6 +171,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -175,6 +185,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -188,6 +199,7 @@ namespace hiptensor 2, float, float, + float, ck::Tuple<>, float, ck::tensor_operation::element_wise::PassThrough, @@ -202,6 +214,7 @@ namespace hiptensor 2, double, double, + float, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, @@ -215,6 +228,7 @@ namespace hiptensor 2, double, double, + double, ck::Tuple<>, double, ck::tensor_operation::element_wise::PassThrough, diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index eee5d7f1..f4be1a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -29,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ...
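The utils.hpp hunks below restore the epsilon-scaled threshold: a result is accepted when its maximum relative error stays within `tolerance` multiples of the element type's machine epsilon. A minimal sketch of that acceptance model (illustrative only; it assumes std::numeric_limits is specialized for DDataType, and the real compareEqual first reduces per-element relative errors to a maximum):

```
#include <cmath>
#include <limits>

// Accept a result when its maximum relative error is within `tolerance`
// multiples of DDataType's machine epsilon.
template <typename DDataType>
bool withinEpsTolerance(double maxRelativeError, double tolerance = 100.0)
{
    const double eps
        = static_cast<double>(std::numeric_limits<DDataType>::epsilon());
    return !std::isnan(maxRelativeError) && maxRelativeError <= eps * tolerance;
}

// e.g. withinEpsTolerance<float>(err) accepts err up to ~1.2e-5, while
// withinEpsTolerance<double>(err) accepts only up to ~2.2e-14.
```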
diff --git a/test/utils.hpp b/test/utils.hpp index f39f0fb5..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -140,7 +140,7 @@ template <typename DDataType> std::pair<bool, double> compareEqual(DDataType const* deviceD, DDataType const* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { bool retval = true; double max_relative_error = 0.0; @@ -202,7 +202,7 @@ std::pair<bool, double> compareEqual(DDataType const* deviceD, retval = false; max_relative_error = std::numeric_limits<double>::signaling_NaN(); } - else if(max_relative_error > tolerance) + else if(max_relative_error > (eps * tolerance)) { retval = false; } @@ -214,7 +214,7 @@ template <typename DDataType> std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, DDataType* hostD, std::size_t elementsD, - double tolerance = 0.001) + double tolerance = 100.0) { auto blockDim = dim3(1024, 1, 1); auto gridDim = dim3(ceilDiv(elementsD, blockDim.x), 1, 1); @@ -276,12 +276,13 @@ std::pair<bool, double> compareEqualLaunchKernel(DDataType* deviceD, auto toDouble = [](DDataType const& val) { return static_cast<double>(static_cast<float>(val)); }; + auto eps = toDouble(std::numeric_limits<DDataType>::epsilon()); if(isNaN) { retval = false; maxRelativeError = std::numeric_limits<double>::signaling_NaN(); } - else if(maxRelativeError > tolerance) + else if(maxRelativeError > (eps * tolerance)) { retval = false; }
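A closing note on the AccDataType guard added in this patch: ck::bhalf_t is a bare ushort, so multiplying two ck::bhalf_t values would do integer arithmetic on raw bit patterns, which is why the reference converts to a wider type before accumulating. A minimal sketch that emulates the bf16 layout with a plain uint16_t (not CK code) to show the difference:

```
#include <cstdint>
#include <cstdio>
#include <cstring>

// A bf16 value stores the top 16 bits of a float. Emulate that layout
// with uint16_t, the way ck::bhalf_t uses ushort.
using bhalf_emul = std::uint16_t;

float bhalfToFloat(bhalf_emul x)
{
    std::uint32_t bits = static_cast<std::uint32_t>(x) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

bhalf_emul floatToBhalf(float f) // truncating conversion, for illustration
{
    std::uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<bhalf_emul>(bits >> 16);
}

int main()
{
    bhalf_emul a = floatToBhalf(1.5f);
    bhalf_emul b = floatToBhalf(2.0f);

    float wrong   = static_cast<float>(a) * static_cast<float>(b); // bit patterns
    float correct = bhalfToFloat(a) * bhalfToFloat(b);             // convert first

    std::printf("raw ushort product: %f\n", wrong);   // 267386880, nonsense
    std::printf("converted product:  %f\n", correct); // 3.0
    return 0;
}
```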