diff --git a/library/src/include/config.hpp b/library/include/hiptensor/internal/config.hpp
similarity index 100%
rename from library/src/include/config.hpp
rename to library/include/hiptensor/internal/config.hpp
diff --git a/library/include/hiptensor/internal/hiptensor_utility.hpp b/library/include/hiptensor/internal/hiptensor_utility.hpp
index f2df2dd2..c386bbe0 100644
--- a/library/include/hiptensor/internal/hiptensor_utility.hpp
+++ b/library/include/hiptensor/internal/hiptensor_utility.hpp
@@ -31,6 +31,7 @@
 #include
 #include "../hiptensor_types.hpp"
+#include "types_ext.hpp"

 #ifndef CHECK_HIP_ERROR
 #define CHECK_HIP_ERROR(expression) \
diff --git a/library/src/include/native_types.hpp b/library/include/hiptensor/internal/native_types.hpp
similarity index 100%
rename from library/src/include/native_types.hpp
rename to library/include/hiptensor/internal/native_types.hpp
diff --git a/library/src/include/native_types_impl.hpp b/library/include/hiptensor/internal/native_types_impl.hpp
similarity index 100%
rename from library/src/include/native_types_impl.hpp
rename to library/include/hiptensor/internal/native_types_impl.hpp
diff --git a/library/src/include/type_traits.hpp b/library/include/hiptensor/internal/type_traits.hpp
similarity index 100%
rename from library/src/include/type_traits.hpp
rename to library/include/hiptensor/internal/type_traits.hpp
diff --git a/library/src/include/types.hpp b/library/include/hiptensor/internal/types.hpp
similarity index 100%
rename from library/src/include/types.hpp
rename to library/include/hiptensor/internal/types.hpp
diff --git a/library/src/include/types_ext.hpp b/library/include/hiptensor/internal/types_ext.hpp
similarity index 100%
rename from library/src/include/types_ext.hpp
rename to library/include/hiptensor/internal/types_ext.hpp
diff --git a/library/src/include/xfloat32.hpp b/library/include/hiptensor/internal/xfloat32.hpp
similarity index 100%
rename from library/src/include/xfloat32.hpp
rename to library/include/hiptensor/internal/xfloat32.hpp
diff --git a/library/src/contraction/contraction_cpu_reference.cpp b/library/src/contraction/contraction_cpu_reference.cpp
index 13dcdffd..ac1d9711 100644
--- a/library/src/contraction/contraction_cpu_reference.cpp
+++ b/library/src/contraction/contraction_cpu_reference.cpp
@@ -28,31 +28,33 @@
 #include "contraction_cpu_reference_impl.hpp"
 #include "contraction_cpu_reference_instances.hpp"

-hiptensorStatus_t hiptensorContractionReference(void const* alpha,
-                                                void const* A,
-                                                void const* B,
-                                                void const* beta,
-                                                void const* C,
-                                                void*       D,
-                                                std::vector<std::size_t> const& a_ms_ks_lengths,
-                                                std::vector<std::size_t> const& a_ms_ks_strides,
-                                                std::vector<std::size_t> const& b_ns_ks_lengths,
-                                                std::vector<std::size_t> const& b_ns_ks_strides,
-                                                std::vector<std::size_t> const& c_ms_ns_lengths,
-                                                std::vector<std::size_t> const& c_ms_ns_strides,
-                                                std::vector<std::size_t> const& d_ms_ns_lengths,
-                                                std::vector<std::size_t> const& d_ms_ns_strides,
-                                                hipDataType typeA,
-                                                hipDataType typeB,
-                                                hipDataType typeC,
-                                                hipDataType typeD,
-                                                void*       workspace)
+hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan,
+                                                void const* alpha,
+                                                void const* A,
+                                                void const* B,
+                                                void const* beta,
+                                                void const* C,
+                                                void*       D,
+                                                std::vector<std::size_t> const& a_ms_ks_lengths,
+                                                std::vector<std::size_t> const& a_ms_ks_strides,
+                                                std::vector<std::size_t> const& b_ns_ks_lengths,
+                                                std::vector<std::size_t> const& b_ns_ks_strides,
+                                                std::vector<std::size_t> const& c_ms_ns_lengths,
+                                                std::vector<std::size_t> const& c_ms_ns_strides,
+                                                std::vector<std::size_t> const& d_ms_ns_lengths,
+                                                std::vector<std::size_t> const& d_ms_ns_strides,
+                                                hipDataType typeA,
+                                                hipDataType typeB,
+                                                hipDataType typeC,
+                                                hipDataType typeD,
+                                                void*       workspace)
 {
-    auto& instances = hiptensor::ContractionCpuReferenceInstances::instance();
+    auto& instances   = hiptensor::ContractionCpuReferenceInstances::instance();
+    auto  computeType = plan->mContractionDesc.mComputeType;
     auto candidates
-        = (C == nullptr)
-              ? instances->allSolutions().query(typeA, typeB, hiptensor::NONE_TYPE, typeD)
-              : instances->allSolutions().query(typeA, typeB, typeC, typeD);
+        = (C == nullptr) ? instances->allSolutions().query(
+              typeA, typeB, hiptensor::NONE_TYPE, typeD, computeType)
+                         : instances->allSolutions().query(typeA, typeB, typeC, typeD, computeType);

     auto toCKVec
         = [](auto& inputVec) { return std::vector<ck::index_t>(inputVec.begin(), inputVec.end()); };
diff --git a/library/src/contraction/contraction_cpu_reference.hpp b/library/src/contraction/contraction_cpu_reference.hpp
index aadb062e..471026dc 100644
--- a/library/src/contraction/contraction_cpu_reference.hpp
+++ b/library/src/contraction/contraction_cpu_reference.hpp
@@ -32,24 +32,25 @@
 #include

-hiptensorStatus_t hiptensorContractionReference(void const* alpha,
-                                                void const* A,
-                                                void const* B,
-                                                void const* beta,
-                                                void const* C,
-                                                void*       D,
-                                                std::vector<std::size_t> const& a_ms_ks_lengths,
-                                                std::vector<std::size_t> const& a_ms_ks_strides,
-                                                std::vector<std::size_t> const& b_ks_ns_lengths,
-                                                std::vector<std::size_t> const& b_ks_ns_strides,
-                                                std::vector<std::size_t> const& c_ms_ns_lengths,
-                                                std::vector<std::size_t> const& c_ms_ns_strides,
-                                                std::vector<std::size_t> const& d_ms_ns_lengths,
-                                                std::vector<std::size_t> const& d_ms_ns_strides,
-                                                hipDataType typeA,
-                                                hipDataType typeB,
-                                                hipDataType typeC,
-                                                hipDataType typeD,
-                                                void*       workspace);
+hiptensorStatus_t hiptensorContractionReference(const hiptensorContractionPlan_t* plan,
+                                                void const* alpha,
+                                                void const* A,
+                                                void const* B,
+                                                void const* beta,
+                                                void const* C,
+                                                void*       D,
+                                                std::vector<std::size_t> const& a_ms_ks_lengths,
+                                                std::vector<std::size_t> const& a_ms_ks_strides,
+                                                std::vector<std::size_t> const& b_ks_ns_lengths,
+                                                std::vector<std::size_t> const& b_ks_ns_strides,
+                                                std::vector<std::size_t> const& c_ms_ns_lengths,
+                                                std::vector<std::size_t> const& c_ms_ns_strides,
+                                                std::vector<std::size_t> const& d_ms_ns_lengths,
+                                                std::vector<std::size_t> const& d_ms_ns_strides,
+                                                hipDataType typeA,
+                                                hipDataType typeB,
+                                                hipDataType typeC,
+                                                hipDataType typeD,
+                                                void*       workspace);

 #endif // HIPTENSOR_CONTRACTION_CPU_REFERENCE_HPP
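
With the plan threaded through, the CPU reference no longer infers the compute type from the tensor data types; it reads it off the plan's descriptor and folds it into the solution query. A hypothetical call site under the new signature (everything except the API name and enum constants is illustrative):

```cpp
// Assumed usage sketch: the plan now leads the argument list, and the query
// inside hiptensorContractionReference keys on plan->mContractionDesc.mComputeType.
CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan,
                                                    &alpha, A_h, B_h, &beta, C_h, D_h,
                                                    a_lengths, a_strides,
                                                    b_lengths, b_strides,
                                                    c_lengths, c_strides,
                                                    c_lengths, c_strides, // D shares C's shape here
                                                    HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F,
                                                    workspace));
```
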
diff --git a/library/src/contraction/contraction_cpu_reference_impl.hpp b/library/src/contraction/contraction_cpu_reference_impl.hpp
index 673f6dff..2e3d0cbe 100644
--- a/library/src/contraction/contraction_cpu_reference_impl.hpp
+++ b/library/src/contraction/contraction_cpu_reference_impl.hpp
@@ -45,19 +45,25 @@ namespace hiptensor
 {
     // hardcoded for NumDimM == NumDimN == NumDimK == 2
+    //
+    // ck::bhalf_t is ushort, so we cannot perform bhalf_t * bhalf_t.
+    // CK does not use ck::bhalf_t as AccDataType, but we still add
+    // this guard here.
     template <ck::index_t NumDimM,
               ck::index_t NumDimN,
               ck::index_t NumDimK,
               typename ADataType,
               typename BDataType,
+              typename AccDataType,
               typename DsDataType,
               typename EDataType,
-              typename AccDataType,
               typename AElementwiseOperation,
               typename BElementwiseOperation,
               typename CDEElementwiseOperation,
+              typename ComputeDataType = ADataType,
               ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
     struct ReferenceContraction_M2_N2_K2
@@ -70,7 +76,8 @@ namespace hiptensor
                                          EDataType,
                                          AElementwiseOperation,
                                          BElementwiseOperation,
-                                         CDEElementwiseOperation>
+                                         CDEElementwiseOperation,
+                                         ComputeDataType>
     {
         using BaseArgument = ck::tensor_operation::device::BaseArgument;
         using BaseInvoker  = ck::tensor_operation::device::BaseInvoker;
@@ -150,7 +157,7 @@ namespace hiptensor
             };

             auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
-                auto accum = static_cast<AccDataType>(0);
+                AccDataType accum = 0;

                 auto K0 = arg.mA_ms_ks_lengths[2];
                 auto K1 = arg.mA_ms_ks_lengths[3];
@@ -164,16 +171,19 @@ namespace hiptensor
                         auto indexB
                             = offset(std::vector{n0, n1, k0, k1}, arg.mB_ns_ks_strides);

-                        ADataType valA;
-                        BDataType valB;
+                        AccDataType valA;
+                        AccDataType valB;

                         // Element-wise ops
-                        arg.mOpA(valA, ((ADataType*)arg.mA)[indexA]);
-                        arg.mOpB(valB, ((BDataType*)arg.mB)[indexB]);
+                        arg.mOpA(
+                            valA,
+                            ck::type_convert<AccDataType>(((ADataType*)arg.mA)[indexA]));
+                        arg.mOpB(
+                            valB,
+                            ck::type_convert<AccDataType>(((BDataType*)arg.mB)[indexB]));

                         // Mult / accum
-                        accum
-                            += static_cast<AccDataType>(valA) * static_cast<AccDataType>(valB);
+                        accum += valA * valB;
                     }
                 }
@@ -182,15 +192,17 @@ namespace hiptensor

                 if constexpr(std::is_same_v<CDEElementwiseOperation,
                                             ck::tensor_operation::element_wise::Scale>)
                 {
-                    arg.mOpCDE(((EDataType*)arg.mE)[indexE], accum);
+                    arg.mOpCDE(((EDataType*)arg.mE)[indexE],
+                               ck::type_convert<float>(accum));
                 }
                 else // bilinear
                 {
                     // NumDTensor will be 1 due to SFINAE of this class
                     auto indexD
                         = offset(std::vector{m0, m1, n0, n1}, arg.mD_ms_ns_strides[0]);
-                    arg.mOpCDE(
-                        ((EDataType*)arg.mE)[indexE], accum, ((EDataType*)(arg.mD[0]))[indexD]);
+                    arg.mOpCDE(((EDataType*)arg.mE)[indexE],
+                               ck::type_convert<float>(accum),
+                               ((EDataType*)(arg.mD[0]))[indexD]);
                 }
             };
@@ -319,23 +331,25 @@ namespace hiptensor
               ck::index_t NumDimsK,
               typename ADataType,
               typename BDataType,
+              typename AccDataType,
               typename DsDataType,
               typename EDataType,
-              typename AccumDataType,
               typename AElementwiseOperation,
               typename BElementwiseOperation,
-              typename CDEElementwiseOperation>
+              typename CDEElementwiseOperation,
+              typename ComputeDataType>
     struct MetaTraits<ReferenceContraction_M2_N2_K2<NumDimsM, NumDimsN, NumDimsK,
                           ADataType, BDataType, AccDataType, DsDataType, EDataType,
                           AElementwiseOperation, BElementwiseOperation,
-                          CDEElementwiseOperation>>
+                          CDEElementwiseOperation,
+                          ComputeDataType>>
         : public MetaTraits<
               ck::tensor_operation::device::DeviceContractionMultipleD<NumDimsM, NumDimsN, NumDimsK,
                   ADataType, BDataType, DsDataType, EDataType,
                   AElementwiseOperation, BElementwiseOperation,
-                  CDEElementwiseOperation>>
+                  CDEElementwiseOperation,
+                  ComputeDataType>>
     {
     };
@@ -355,11 +370,13 @@ namespace hiptensor
               ck::index_t NumDimK,
               typename ADataType,
               typename BDataType,
+              typename AccDataType,
               typename DsDataType,
               typename EDataType,
               typename AElementwiseOperation,
               typename BElementwiseOperation,
-              typename CDEElementwiseOperation>
+              typename CDEElementwiseOperation,
+              typename ComputeDataType = ADataType>
     auto enumerateReferenceSolutions()
     {
         using ReferenceOp = ReferenceContraction_M2_N2_K2<NumDimM, NumDimN, NumDimK,
                                 ADataType, BDataType, AccDataType, DsDataType, EDataType,
                                 AElementwiseOperation, BElementwiseOperation,
-                                CDEElementwiseOperation>;
+                                CDEElementwiseOperation,
+                                ComputeDataType>;

         auto solution = std::make_unique<ContractionSolutionImpl<ReferenceOp>>(
             std::make_unique<ReferenceOp>());
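
The rewritten inner loop is the standard mixed-precision pattern: widen both operands to AccDataType, multiply-accumulate in that type, and only convert at the output. The guard matters for bf16 because ck::bhalf_t is a bare ushort; multiplying two of them would combine raw bit patterns, not values. A self-contained sketch of the same pattern (the bf16 helper is an assumption standing in for ck::type_convert, not hipTensor/CK code):

```cpp
#include <cstdint>
#include <cstring>

// Assumed helper mirroring ck::type_convert<float>(ck::bhalf_t): bf16 is the
// top 16 bits of an IEEE-754 float.
static float bf16_to_float(uint16_t v)
{
    uint32_t bits = uint32_t(v) << 16;
    float    f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Accumulate in float (the AccDataType), exactly as the reference kernel does.
static float dot_bf16(const uint16_t* a, const uint16_t* b, int k)
{
    float accum = 0.f;
    for(int i = 0; i < k; ++i)
    {
        // uint16_t * uint16_t would multiply bit patterns, so convert first
        accum += bf16_to_float(a[i]) * bf16_to_float(b[i]);
    }
    return accum;
}
```
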
diff --git a/library/src/contraction/contraction_cpu_reference_instances.cpp b/library/src/contraction/contraction_cpu_reference_instances.cpp
index 106dd5ff..31fb0191 100644
--- a/library/src/contraction/contraction_cpu_reference_instances.cpp
+++ b/library/src/contraction/contraction_cpu_reference_instances.cpp
@@ -32,6 +32,36 @@ namespace hiptensor
     ContractionCpuReferenceInstances::ContractionCpuReferenceInstances()
     {
         // Register all the solutions exactly once
+        // Bilinear f16
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        ck::half_t, ck::half_t, float,
+                                        ck::Tuple<ck::half_t>, ck::half_t,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
+
+        // Bilinear bf16
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        ck::bhalf_t, ck::bhalf_t, float,
+                                        ck::Tuple<ck::bhalf_t>, ck::bhalf_t,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
+
         // Bilinear f32
         registerSolutions(
             enumerateReferenceSolutions<2,
@@ -39,11 +69,41 @@ namespace hiptensor
                                         2,
                                         float,
                                         float,
+                                        float,
                                         ck::Tuple<float>,
                                         float,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Bilinear>());
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        ck::half_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        float, float, float,
+                                        ck::Tuple<float>, float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        ck::bhalf_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        float, float, float,
+                                        ck::Tuple<float>, float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());

         // Bilinear f64
         registerSolutions(
             enumerateReferenceSolutions<2,
@@ -52,11 +112,57 @@ namespace hiptensor
                                         2,
                                         double,
                                         double,
+                                        float,
                                         ck::Tuple<double>,
                                         double,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Bilinear>());
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        float>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        double, double, double,
+                                        ck::Tuple<double>, double,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Bilinear,
+                                        double>());
+
+        // Scale f16
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        ck::half_t, ck::half_t, float,
+                                        ck::Tuple<>, ck::half_t,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());
+
+        // Scale bf16
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        ck::bhalf_t, ck::bhalf_t, float,
+                                        ck::Tuple<>, ck::bhalf_t,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());

         // Scale f32
         registerSolutions(
             enumerateReferenceSolutions<2,
@@ -65,11 +171,41 @@ namespace hiptensor
                                         2,
                                         float,
                                         float,
+                                        float,
                                         ck::Tuple<>,
                                         float,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Scale>());
+                                        ck::tensor_operation::element_wise::Scale,
+                                        ck::half_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        float, float, float,
+                                        ck::Tuple<>, float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        ck::bhalf_t>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2, 2, 2,
+                                        float, float, float,
+                                        ck::Tuple<>, float,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());

         // Scale f64
         registerSolutions(
             enumerateReferenceSolutions<2,
@@ -78,10 +214,26 @@ namespace hiptensor
                                         2,
                                         double,
                                         double,
+                                        float,
+                                        ck::Tuple<>,
+                                        double,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::PassThrough,
+                                        ck::tensor_operation::element_wise::Scale,
+                                        float>());
+
+        registerSolutions(
+            enumerateReferenceSolutions<2,
+                                        2,
+                                        2,
+                                        double,
+                                        double,
+                                        double,
                                         ck::Tuple<>,
                                         double,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
-                                        ck::tensor_operation::element_wise::Scale>());
+                                        ck::tensor_operation::element_wise::Scale,
+                                        double>());
     }
 } // namespace hiptensor
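
Each registerSolutions call above pins one (A/B data, E data, compute) combination, so the reference registry now mirrors the device-side support matrix: f16 and bf16 data computed in f32; f32 data computed in f16, bf16, or f32; and f64 data computed in f32 or f64, for both Bilinear and Scale. A hypothetical lookup against that matrix (this sketch assumes the internal registry header is visible; the enum constants are the public hipDataType/hiptensorComputeType_t values):

```cpp
#include <hiptensor/hiptensor_types.hpp> // hipDataType, hiptensorComputeType_t

// Illustrative only: identical f32 tensor types resolve to different
// reference instances once the compute type participates in the query key.
void sketchQueries(hiptensor::ContractionSolutionRegistry::Query& all)
{
    auto f32WithF16 = all.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F,
                                HIPTENSOR_COMPUTE_16F);
    auto f32WithF32 = all.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F,
                                HIPTENSOR_COMPUTE_32F);
    (void)f32WithF16;
    (void)f32WithF32;
}
```
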
diff --git a/library/src/contraction/contraction_meta_traits.hpp b/library/src/contraction/contraction_meta_traits.hpp
index 4fa7acf7..e66ac432 100644
--- a/library/src/contraction/contraction_meta_traits.hpp
+++ b/library/src/contraction/contraction_meta_traits.hpp
@@ -49,7 +49,8 @@ namespace hiptensor
               typename DsDataType,
               typename EDataType,
               typename AElementwiseOperation,
-              typename BElementwiseOperation>
+              typename BElementwiseOperation,
+              typename ComputeDataType>
     struct MetaTraits<ck::tensor_operation::device::DeviceContractionMultipleD<
         NumDimsM, NumDimsN, NumDimsK, ADataType, BDataType, DsDataType, EDataType,
         AElementwiseOperation, BElementwiseOperation,
-        ck::tensor_operation::element_wise::Bilinear>>
+        ck::tensor_operation::element_wise::Bilinear,
+        ComputeDataType>>
     {
         constexpr static ck::index_t DimsM = NumDimsM;
         constexpr static ck::index_t DimsN = NumDimsN;
         constexpr static ck::index_t DimsK = NumDimsK;
-        using ADataT = ADataType;
-        using BDataT = BDataType;
-        using DDataT = DsDataType;
-        using EDataT = EDataType;
-        using AOp    = AElementwiseOperation;
-        using BOp    = BElementwiseOperation;
-        using CDEOp  = ck::tensor_operation::element_wise::Bilinear;
+        /*
+         * CK does not use hip_bfloat16; instead it uses ushort (ck::bhalf_t) for the bf16 type.
+         * What we want here is to use ck::bhalf_t with CK instances and hip_bfloat16 with
+         * hiptensor classes.
+         *
+         * When creating a solution, ck::bhalf_t is passed in to create the CK instance.
+         * When registering the solution, MetaTraits will return hip_bfloat16 to create the key.
+         */
+        using ADataT
+            = std::conditional_t<std::is_same_v<ADataType, ck::bhalf_t>, hip_bfloat16, ADataType>;
+        using BDataT
+            = std::conditional_t<std::is_same_v<BDataType, ck::bhalf_t>, hip_bfloat16, BDataType>;
+        using DDataT
+            = std::conditional_t<std::is_same_v<DsDataType, ck::bhalf_t>, hip_bfloat16, DsDataType>;
+        using EDataT
+            = std::conditional_t<std::is_same_v<EDataType, ck::bhalf_t>, hip_bfloat16, EDataType>;
+        using ComputeDataT = std::conditional_t<std::is_same_v<ComputeDataType, ck::bhalf_t>,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp   = AElementwiseOperation;
+        using BOp   = BElementwiseOperation;
+        using CDEOp = ck::tensor_operation::element_wise::Bilinear;
     };

     // Partial specialize for Scale contraction
@@ -82,7 +99,8 @@ namespace hiptensor
               typename BDataType,
               typename EDataType,
               typename AElementwiseOperation,
-              typename BElementwiseOperation>
+              typename BElementwiseOperation,
+              typename ComputeDataType>
     struct MetaTraits<ck::tensor_operation::device::DeviceContractionMultipleD<
         NumDimsM, NumDimsN, NumDimsK, ADataType, BDataType, ck::Tuple<>, EDataType,
         AElementwiseOperation, BElementwiseOperation,
-        ck::tensor_operation::element_wise::Scale>>
+        ck::tensor_operation::element_wise::Scale,
+        ComputeDataType>>
     {
         constexpr static ck::index_t DimsM = NumDimsM;
         constexpr static ck::index_t DimsN = NumDimsN;
         constexpr static ck::index_t DimsK = NumDimsK;
-        using ADataT = ADataType;
-        using BDataT = BDataType;
-        using DDataT = NoneType;
-        using EDataT = EDataType;
-        using AOp    = AElementwiseOperation;
-        using BOp    = BElementwiseOperation;
-        using CDEOp  = ck::tensor_operation::element_wise::Scale;
+        using ADataT
+            = std::conditional_t<std::is_same_v<ADataType, ck::bhalf_t>, hip_bfloat16, ADataType>;
+        using BDataT
+            = std::conditional_t<std::is_same_v<BDataType, ck::bhalf_t>, hip_bfloat16, BDataType>;
+        using DDataT = NoneType;
+        using EDataT
+            = std::conditional_t<std::is_same_v<EDataType, ck::bhalf_t>, hip_bfloat16, EDataType>;
+        using ComputeDataT = std::conditional_t<std::is_same_v<ComputeDataType, ck::bhalf_t>,
+                                                hip_bfloat16,
+                                                ComputeDataType>;
+        using AOp   = AElementwiseOperation;
+        using BOp   = BElementwiseOperation;
+        using CDEOp = ck::tensor_operation::element_wise::Scale;
     };
 } // namespace hiptensor
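
The conditional aliases are the whole bf16 bridging trick in miniature: solutions are instantiated with CK's ushort-based ck::bhalf_t, but their registry identity is reported in terms of hip_bfloat16. A standalone illustration of that mapping (stand-in types, not the real ones):

```cpp
#include <type_traits>

struct hip_bfloat16_like { unsigned short data; }; // stand-in for hip_bfloat16
using ck_bhalf_like = unsigned short;              // stand-in for ck::bhalf_t

// Same shape as the MetaTraits aliases above: expose hip_bfloat16 wherever the
// CK instance was built with bhalf_t, and pass every other type through.
template <typename DataType>
using ExposedT = std::conditional_t<std::is_same_v<DataType, ck_bhalf_like>,
                                    hip_bfloat16_like,
                                    DataType>;

static_assert(std::is_same_v<ExposedT<float>, float>);
static_assert(std::is_same_v<ExposedT<ck_bhalf_like>, hip_bfloat16_like>);
```
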
diff --git a/library/src/contraction/contraction_selection.cpp b/library/src/contraction/contraction_selection.cpp
index aaa624f6..9b0cdf9f 100644
--- a/library/src/contraction/contraction_selection.cpp
+++ b/library/src/contraction/contraction_selection.cpp
@@ -54,6 +54,7 @@ namespace hiptensor
                  hipDataType typeE,
                  std::vector<std::size_t> const& e_ms_ns_lengths,
                  std::vector<std::size_t> const& e_ms_ns_strides,
+                 hiptensorComputeType_t computeType,
                  const uint64_t workspaceSize)
     {
         // Make sure that we calculate full element space incase strides are not packed.
@@ -71,8 +72,19 @@ namespace hiptensor
                      * hipDataTypeSize(typeE);

         void *A_d, *B_d, *D_d, *E_d, *wspace;
-        float alpha = 1.02f;
-        float beta  = 1.03f;
+
+        /*
+         * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha:
+         * ```
+         * alphaF = hiptensor::readVal<float>(
+         *     alpha, convertToComputeType(HipDataType_v<ComputeDataType>));
+         * ```
+         * Hence, `alpha` and `beta` need to point to values of the compute type.
+         */
+        double alpha = 0.0;
+        double beta  = 0.0;
+        writeVal(&alpha, computeType, 1.02);
+        writeVal(&beta, computeType, 1.03);

         CHECK_HIP_ALLOC(hipMalloc(&A_d, sizeA));
         CHECK_HIP_ALLOC(hipMalloc(&B_d, sizeB));
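
Storing 1.02/1.03 through writeVal, rather than in float locals, keeps the benchmark honest: the solution later re-reads alpha and beta through readVal with the compute type, so the bytes at those addresses must already be laid out in that type. A minimal sketch of what a writeVal-style helper has to do (own enum and function; only the writeVal calls above are the library's):

```cpp
#include <cstring>

enum class Compute { F32, F64 }; // stand-in for hiptensorComputeType_t

// Assumed semantics: narrow (or keep) the literal to the compute type's
// layout before storing it into the void* slot.
static void writeValSketch(void* dst, Compute t, double value)
{
    if(t == Compute::F32)
    {
        float v = static_cast<float>(value);
        std::memcpy(dst, &v, sizeof(v)); // 4 bytes, f32 bit layout
    }
    else
    {
        std::memcpy(dst, &value, sizeof(value)); // 8 bytes, f64 bit layout
    }
}
```
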
@@ -151,7 +163,381 @@ namespace hiptensor
     }

     template <>
-    struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE>
+    struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16,
+                                ContractionOpId_t::SCALE, float>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+
+            // TODO select unique_id
+            unique_id = 7255639152084218514ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<_Float16, _Float16, _Float16, _Float16,
+                                ContractionOpId_t::BILINEAR, float>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+
+            // TODO select unique_id
+            unique_id = 7255639152084218514ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<hip_bfloat16, hip_bfloat16, hip_bfloat16, hip_bfloat16,
+                                ContractionOpId_t::SCALE, float>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+
+            // TODO select unique_id
+            unique_id = 8689089455041651212ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<hip_bfloat16, hip_bfloat16, hip_bfloat16, hip_bfloat16,
+                                ContractionOpId_t::BILINEAR, float>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+
+            // TODO select unique_id
+            unique_id = 8689089455041651212ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<float, float, float, float,
+                                ContractionOpId_t::SCALE, _Float16>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+
+            // TODO select unique_id
+            unique_id = 1078559130597702989ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<float, float, float, float,
+                                ContractionOpId_t::BILINEAR, _Float16>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+            // TODO select unique_id
+            unique_id = 6506383527825239632ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<float, float, float, float,
+                                ContractionOpId_t::SCALE, hip_bfloat16>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+            // TODO select unique_id
+            unique_id = 14486135440731032454ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<float, float, float, float,
+                                ContractionOpId_t::BILINEAR, hip_bfloat16>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+            // TODO select unique_id
+            unique_id = 11931735240548010466ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE, float>
     {
         static hiptensorStatus_t selectWinner(
             ContractionSolution** winner,
@@ -516,7 +902,7 @@ namespace hiptensor
     };

     template <>
-    struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::BILINEAR>
+    struct ActorCriticSelection<float, float, float, float, ContractionOpId_t::BILINEAR, float>
     {
         static hiptensorStatus_t selectWinner(
             ContractionSolution** winner,
@@ -874,7 +1260,94 @@ namespace hiptensor
     };

     template <>
-    struct ActorCriticSelection<double, double, double, double, ContractionOpId_t::SCALE>
+    struct ActorCriticSelection<double, double, double, double, ContractionOpId_t::SCALE, float>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+            // TODO select unique_id
+            unique_id = 11912251726020349830ull;
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<double, double, double, double, ContractionOpId_t::BILINEAR, float>
+    {
+        static hiptensorStatus_t selectWinner(
+            ContractionSolution** winner,
+            std::unordered_map<size_t, ContractionSolution*> const& candidates,
+            hipDataType typeA, std::vector<std::size_t> const& a_ms_ks_lengths,
+            std::vector<std::size_t> const& a_ms_ks_strides,
+            hipDataType typeB, std::vector<std::size_t> const& b_ns_ks_lengths,
+            std::vector<std::size_t> const& b_ns_ks_strides,
+            hipDataType typeD, std::vector<std::size_t> const& d_ms_ns_lengths,
+            std::vector<std::size_t> const& d_ms_ns_strides,
+            hipDataType typeE, std::vector<std::size_t> const& e_ms_ns_lengths,
+            std::vector<std::size_t> const& e_ms_ns_strides, const uint64_t workspaceSize)
+        {
+            int d1 = a_ms_ks_lengths[0];
+            int d2 = a_ms_ks_lengths[1];
+            int d3 = b_ns_ks_lengths[0];
+            int d4 = b_ns_ks_lengths[1];
+            int d5 = a_ms_ks_lengths[2];
+            int d6 = a_ms_ks_lengths[3];
+
+            size_t unique_id = 0;
+            unique_id = 15375432626310194825ull;
+            // TODO select unique_id
+
+            if(auto candidate = candidates.find(unique_id); candidate != candidates.end())
+            {
+                *winner = candidate->second;
+                return HIPTENSOR_STATUS_SUCCESS;
+            }
+            else
+            {
+                return HIPTENSOR_STATUS_EXECUTION_FAILED;
+            }
+        }
+    };
+
+    template <>
+    struct ActorCriticSelection<double, double, double, double, ContractionOpId_t::SCALE, double>
     {
         static hiptensorStatus_t selectWinner(
             ContractionSolution** winner,
@@ -1149,7 +1622,7 @@ namespace hiptensor
     };

     template <>
-    struct ActorCriticSelection<double, double, double, double, ContractionOpId_t::BILINEAR>
+    struct ActorCriticSelection<double, double, double, double, ContractionOpId_t::BILINEAR, double>
     {
         static hiptensorStatus_t selectWinner(
             ContractionSolution** winner,
@@ -1416,89 +1889,344 @@ namespace hiptensor
                  hipDataType typeE,
                  std::vector<std::size_t> const& e_ms_ns_lengths,
                  std::vector<std::size_t> const& e_ms_ns_strides,
+                 hiptensorComputeType_t computeType,
                  const uint64_t workspaceSize)
     {
-        if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F)
+        if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == NONE_TYPE && typeE == HIP_R_16F
+           && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<_Float16, _Float16, _Float16, _Float16,
+                                        ContractionOpId_t::SCALE, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_16F && typeB == HIP_R_16F && typeD == HIP_R_16F && typeE == HIP_R_16F
+                && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<_Float16, _Float16, _Float16, _Float16,
+                                        ContractionOpId_t::BILINEAR, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == NONE_TYPE
+                && typeE == HIP_R_16BF && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<hip_bfloat16, hip_bfloat16, hip_bfloat16, hip_bfloat16,
+                                        ContractionOpId_t::SCALE, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_16BF && typeB == HIP_R_16BF && typeD == HIP_R_16BF
+                && typeE == HIP_R_16BF && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<hip_bfloat16, hip_bfloat16, hip_bfloat16, hip_bfloat16,
+                                        ContractionOpId_t::BILINEAR, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F
+                && computeType == HIP_R_16F)
+        {
+            return ActorCriticSelection<float, float, float, float,
+                                        ContractionOpId_t::SCALE, _Float16>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F
+                && computeType == HIP_R_16F)
+        {
+            return ActorCriticSelection<float, float, float, float,
+                                        ContractionOpId_t::BILINEAR, _Float16>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F
+                && computeType == HIP_R_16BF)
         {
-            return ActorCriticSelection<float, float, float, float, ContractionOpId_t::SCALE>::
-                selectWinner(winner, candidates,
-                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
-                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
-                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
-                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
-                             workspaceSize);
+            return ActorCriticSelection<float, float, float, float,
+                                        ContractionOpId_t::SCALE, hip_bfloat16>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
         }
-        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F
-                && typeE == HIP_R_32F)
+        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F
+                && computeType == HIP_R_16BF)
         {
-            return ActorCriticSelection<float, float, float, float, ContractionOpId_t::BILINEAR>::
-                selectWinner(winner, candidates,
-                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
-                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
-                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
-                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
-                             workspaceSize);
+            return ActorCriticSelection<float, float, float, float,
+                                        ContractionOpId_t::BILINEAR, hip_bfloat16>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
         }
-        else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE
-                && typeE == HIP_R_64F)
+        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == NONE_TYPE && typeE == HIP_R_32F
+                && computeType == HIP_R_32F)
         {
-            return ActorCriticSelection<double, double, double, double, ContractionOpId_t::SCALE>::
-                selectWinner(winner, candidates,
-                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
-                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
-                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
-                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
-                             workspaceSize);
+            return ActorCriticSelection<float, float, float, float,
+                                        ContractionOpId_t::SCALE, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_32F && typeB == HIP_R_32F && typeD == HIP_R_32F && typeE == HIP_R_32F
+                && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<float, float, float, float,
+                                        ContractionOpId_t::BILINEAR, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
-        else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F
-                && typeE == HIP_R_64F)
+        else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F
+                && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<double, double, double, double,
+                                        ContractionOpId_t::SCALE, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F
+                && computeType == HIP_R_32F)
+        {
+            return ActorCriticSelection<double, double, double, double,
+                                        ContractionOpId_t::BILINEAR, float>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == NONE_TYPE && typeE == HIP_R_64F
+                && computeType == HIP_R_64F)
+        {
+            return ActorCriticSelection<double, double, double, double,
+                                        ContractionOpId_t::SCALE, double>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
+        }
+        else if(typeA == HIP_R_64F && typeB == HIP_R_64F && typeD == HIP_R_64F && typeE == HIP_R_64F
+                && computeType == HIP_R_64F)
         {
             return ActorCriticSelection<double, double, double, double,
-                                        ContractionOpId_t::BILINEAR>::
-                selectWinner(winner, candidates,
-                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
-                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
-                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
-                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
-                             workspaceSize);
+                                        ContractionOpId_t::BILINEAR,
+                                        double>::
+                selectWinner(winner, candidates,
+                             typeA, a_ms_ks_lengths, a_ms_ks_strides,
+                             typeB, b_ns_ks_lengths, b_ns_ks_strides,
+                             typeD, d_ms_ns_lengths, d_ms_ns_strides,
+                             typeE, e_ms_ns_lengths, e_ms_ns_strides,
+                             workspaceSize);
         }

         return HIPTENSOR_STATUS_EXECUTION_FAILED;
     }
diff --git a/library/src/contraction/contraction_selection.hpp b/library/src/contraction/contraction_selection.hpp
index 9ceb6a14..deb980d9 100644
--- a/library/src/contraction/contraction_selection.hpp
+++ b/library/src/contraction/contraction_selection.hpp
@@ -49,9 +49,15 @@ namespace hiptensor
                  hipDataType typeE,
                  std::vector<std::size_t> const& e_ms_ns_lengths,
                  std::vector<std::size_t> const& e_ms_ns_strides,
+                 hiptensorComputeType_t computeType,
                  const uint64_t workspaceSize);

-    template <typename ADataType, typename BDataType, typename CDataType, typename DDataType, ContractionOpId_t opId>
+    template <typename ADataType,
+              typename BDataType,
+              typename CDataType,
+              typename DDataType,
+              ContractionOpId_t opId,
+              typename ComputeDataType>
     struct ActorCriticSelection
     {
         static hiptensorStatus_t
@@ -87,6 +93,7 @@ namespace hiptensor
                  hipDataType typeE,
                  std::vector<std::size_t> const& e_ms_ns_lengths,
                  std::vector<std::size_t> const& e_ms_ns_strides,
+                 hiptensorComputeType_t computeType,
                  const uint64_t workspaceSize);

 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_solution.hpp b/library/src/contraction/contraction_solution.hpp
index 0037584e..e76bb351 100644
--- a/library/src/contraction/contraction_solution.hpp
+++ b/library/src/contraction/contraction_solution.hpp
@@ -147,7 +147,8 @@ namespace hiptensor
              typename EDataType,
              typename AElementwiseOperation,
              typename BElementwiseOperation,
-             typename CDEElementwiseOperation>
+             typename CDEElementwiseOperation,
+             typename ComputeDataType>
    std::vector<std::unique_ptr<ContractionSolution>> enumerateContractionSolutions();

 } // namespace hiptensor
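
Spelled out, the branch ladder above encodes a small support matrix keyed on (data types, compute type); anything outside it falls through to HIPTENSOR_STATUS_EXECUTION_FAILED. A condensed restatement (comparing computeType against HIP_R_* constants compiles because both are unscoped enums, mirroring the code above):

```cpp
#include <hip/library_types.h> // hipDataType

// One row per data/compute pairing in bestSolution above; SCALE and BILINEAR
// variants exist for every row (typeD == NONE_TYPE selects SCALE).
struct Route { hipDataType data; hipDataType compute; };
constexpr Route kActorCriticRoutes[] = {
    {HIP_R_16F, HIP_R_32F},
    {HIP_R_16BF, HIP_R_32F},
    {HIP_R_32F, HIP_R_16F},
    {HIP_R_32F, HIP_R_16BF},
    {HIP_R_32F, HIP_R_32F},
    {HIP_R_64F, HIP_R_32F},
    {HIP_R_64F, HIP_R_64F},
};
```
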
diff --git a/library/src/contraction/contraction_solution_impl.hpp b/library/src/contraction/contraction_solution_impl.hpp
index 0fb5df9d..3b672fbb 100644
--- a/library/src/contraction/contraction_solution_impl.hpp
+++ b/library/src/contraction/contraction_solution_impl.hpp
@@ -95,11 +95,13 @@ namespace hiptensor

                 if(alpha != nullptr)
                 {
-                    alphaF = hiptensor::readVal<float>(alpha, HipDataType_v<float>);
+                    alphaF = hiptensor::readVal<float>(
+                        alpha, convertToComputeType(HipDataType_v<ComputeDataType>));
                 }
                 if(beta != nullptr)
                 {
-                    betaF = hiptensor::readVal<float>(beta, HipDataType_v<float>);
+                    betaF = hiptensor::readVal<float>(
+                        beta, convertToComputeType(HipDataType_v<ComputeDataType>));
                 }

                 // CK has its own format for indices...
@@ -205,7 +207,8 @@ namespace hiptensor

                 if(alpha != nullptr)
                 {
-                    alphaF = hiptensor::readVal<float>(alpha, HipDataType_v<float>);
+                    alphaF = hiptensor::readVal<float>(
+                        alpha, convertToComputeType(HipDataType_v<ComputeDataType>));
                 }

                 // CK has its own format for indices...
@@ -274,7 +277,8 @@ namespace hiptensor
              typename EDataType,
              typename AElementwiseOperation,
              typename BElementwiseOperation,
-             typename CDEElementwiseOperation>
+             typename CDEElementwiseOperation,
+             typename ComputeDataType = ADataType>
    std::vector<std::unique_ptr<ContractionSolution>> enumerateContractionSolutions()
    {
        using ContractionOp
@@ -287,7 +291,8 @@ namespace hiptensor
                  EDataType,
                  AElementwiseOperation,
                  BElementwiseOperation,
-                 CDEElementwiseOperation>;
+                 CDEElementwiseOperation,
+                 ComputeDataType>;

        using Factory
            = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<ContractionOp>;
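
The readVal change is the consumer side of the writeVal change in contraction_selection.cpp: alpha and beta are reinterpreted according to the solution's compute type, then narrowed to the float that CK's element-wise ops take. A sketch of that two-step read (assumed semantics, not the library's readVal):

```cpp
#include <cstring>

// Reinterpret the caller's alpha according to the compute type, then convert
// to float for CK's Bilinear/Scale operators.
static float readAlphaAsFloat(const void* alpha, bool computeIsF64)
{
    if(computeIsF64)
    {
        double v;
        std::memcpy(&v, alpha, sizeof(v));
        return static_cast<float>(v);
    }
    float v;
    std::memcpy(&v, alpha, sizeof(v));
    return v;
}
```
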
diff --git a/library/src/contraction/contraction_solution_instances.cpp b/library/src/contraction/contraction_solution_instances.cpp
index fd263a8b..aec12e32 100644
--- a/library/src/contraction/contraction_solution_instances.cpp
+++ b/library/src/contraction/contraction_solution_instances.cpp
@@ -32,6 +32,35 @@ namespace hiptensor
     ContractionSolutionInstances::ContractionSolutionInstances()
     {
         // Register all the solutions exactly once
+
+        // Bilinear bf16
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          ck::bhalf_t, ck::bhalf_t,
+                                          ck::Tuple<ck::bhalf_t>, ck::bhalf_t,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          float>());
+
+        // Bilinear f16
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          ck::half_t, ck::half_t,
+                                          ck::Tuple<ck::half_t>, ck::half_t,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          float>());
+
         // Bilinear f32
         registerSolutions(
             enumerateContractionSolutions<2,
@@ -43,7 +72,34 @@ namespace hiptensor
                                           float,
                                           ck::tensor_operation::element_wise::PassThrough,
                                           ck::tensor_operation::element_wise::PassThrough,
-                                          ck::tensor_operation::element_wise::Bilinear>());
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          float>());
+
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          float, float,
+                                          ck::Tuple<float>, float,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          ck::half_t>());
+
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          float, float,
+                                          ck::Tuple<float>, float,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          ck::bhalf_t>());

         // Bilinear f64
         registerSolutions(
@@ -56,7 +112,48 @@ namespace hiptensor
                                           double,
                                           ck::tensor_operation::element_wise::PassThrough,
                                           ck::tensor_operation::element_wise::PassThrough,
-                                          ck::tensor_operation::element_wise::Bilinear>());
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          float>());
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          double, double,
+                                          ck::Tuple<double>, double,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Bilinear,
+                                          double>());
+
+        // Scale bf16
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          ck::bhalf_t, ck::bhalf_t,
+                                          ck::Tuple<>, ck::bhalf_t,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Scale,
+                                          float>());
+
+        // Scale f16
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          ck::half_t, ck::half_t,
+                                          ck::Tuple<>, ck::half_t,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Scale,
+                                          float>());

         // Scale f32
         registerSolutions(
@@ -69,8 +166,34 @@ namespace hiptensor
                                           float,
                                           ck::tensor_operation::element_wise::PassThrough,
                                           ck::tensor_operation::element_wise::PassThrough,
-                                          ck::tensor_operation::element_wise::Scale>());
+                                          ck::tensor_operation::element_wise::Scale,
+                                          float>());
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          float, float,
+                                          ck::Tuple<>, float,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Scale,
+                                          ck::half_t>());
+
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          float, float,
+                                          ck::Tuple<>, float,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Scale,
+                                          ck::bhalf_t>());

         // Scale f64
         registerSolutions(
             enumerateContractionSolutions<2,
@@ -82,6 +205,20 @@ namespace hiptensor
                                           double,
                                           ck::tensor_operation::element_wise::PassThrough,
                                           ck::tensor_operation::element_wise::PassThrough,
-                                          ck::tensor_operation::element_wise::Scale>());
+                                          ck::tensor_operation::element_wise::Scale,
+                                          float>());
+
+        registerSolutions(
+            enumerateContractionSolutions<2, 2, 2,
+                                          double, double,
+                                          ck::Tuple<>, double,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::PassThrough,
+                                          ck::tensor_operation::element_wise::Scale,
+                                          double>());
     }
 } // namespace hiptensor
diff --git a/library/src/contraction/contraction_solution_params.hpp b/library/src/contraction/contraction_solution_params.hpp
index ec9de45c..4c44de88 100644
--- a/library/src/contraction/contraction_solution_params.hpp
+++ b/library/src/contraction/contraction_solution_params.hpp
@@ -49,10 +49,11 @@ namespace hiptensor
         virtual int32_t dimsK() const = 0;

         // Map to hipDataType
-        virtual hipDataType typeA() const = 0;
-        virtual hipDataType typeB() const = 0;
-        virtual hipDataType typeC() const = 0;
-        virtual hipDataType typeD() const = 0;
+        virtual hipDataType            typeA() const = 0;
+        virtual hipDataType            typeB() const = 0;
+        virtual hipDataType            typeC() const = 0;
+        virtual hipDataType            typeD() const = 0;
+        virtual hiptensorComputeType_t typeCompute() const = 0;

         // Map to operators
         virtual hiptensorOperator_t opA() const = 0;
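
The new typeCompute() virtual is derived from each instance's ComputeDataType template argument via convertToComputeType, so two otherwise-identical f32 solutions (one built with ck::half_t compute, one with float) now report different parameters. A plausible shape for that conversion, under the assumption that it is a plain hipDataType-to-hiptensorComputeType_t mapping (the real implementation lives in hiptensor's type utilities and may cover more cases):

```cpp
#include <hiptensor/hiptensor_types.hpp> // hipDataType, hiptensorComputeType_t

// Assumed sketch of convertToComputeType, not the library's code.
hiptensorComputeType_t convertSketch(hipDataType t)
{
    switch(t)
    {
    case HIP_R_16F:  return HIPTENSOR_COMPUTE_16F;
    case HIP_R_16BF: return HIPTENSOR_COMPUTE_16BF;
    case HIP_R_64F:  return HIPTENSOR_COMPUTE_64F;
    case HIP_R_32F:
    default:         return HIPTENSOR_COMPUTE_32F; // assumption: f32 fallback
    }
}
```
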
diff --git a/library/src/contraction/contraction_solution_params_impl.hpp b/library/src/contraction/contraction_solution_params_impl.hpp
index bff33960..b84f9c2b 100644
--- a/library/src/contraction/contraction_solution_params_impl.hpp
+++ b/library/src/contraction/contraction_solution_params_impl.hpp
@@ -42,6 +42,7 @@ namespace std
         return hiptensor::Hash{}(s.dimsM(),
                                  s.dimsN(),
                                  s.dimsK(),
+                                 s.typeCompute(),
                                  s.typeA(),
                                  s.typeB(),
                                  s.typeC(),
@@ -102,6 +103,11 @@ namespace hiptensor
             return HipDataType_v<EDataT>;
         }

+        hiptensorComputeType_t typeCompute() const override
+        {
+            return convertToComputeType(HipDataType_v<ComputeDataT>);
+        }
+
         hiptensorOperator_t opA() const override
         {
             return ElementWiseOperatorType_v<AOp>;
diff --git a/library/src/contraction/contraction_solution_registry.cpp b/library/src/contraction/contraction_solution_registry.cpp
index 83674c81..9e2da1f9 100644
--- a/library/src/contraction/contraction_solution_registry.cpp
+++ b/library/src/contraction/contraction_solution_registry.cpp
@@ -53,19 +53,20 @@ namespace hiptensor
     }

     ContractionSolutionRegistry::Query
-        ContractionSolutionRegistry::Query::query(int32_t             dimsM,
-                                                  int32_t             dimsN,
-                                                  int32_t             dimsK,
-                                                  hipDataType         typeA,
-                                                  hipDataType         typeB,
-                                                  hipDataType         typeC,
-                                                  hipDataType         typeD,
-                                                  hiptensorOperator_t opA,
-                                                  hiptensorOperator_t opB,
-                                                  ContractionOpId_t   opCDE) const
+        ContractionSolutionRegistry::Query::query(int32_t                dimsM,
+                                                  int32_t                dimsN,
+                                                  int32_t                dimsK,
+                                                  hipDataType            typeA,
+                                                  hipDataType            typeB,
+                                                  hipDataType            typeC,
+                                                  hipDataType            typeD,
+                                                  hiptensorOperator_t    opA,
+                                                  hiptensorOperator_t    opB,
+                                                  ContractionOpId_t      opCDE,
+                                                  hiptensorComputeType_t typeCompute) const
     {
-        auto solutionHash
-            = hashSolution(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE);
+        auto solutionHash = hashSolution(
+            dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute);

         if(auto solutions = mSolutionHash.find(solutionHash); solutions != mSolutionHash.end())
         {
@@ -81,10 +82,14 @@ namespace hiptensor
         return query(hashDimsMNK(dimsM, dimsN, dimsK));
     }

-    ContractionSolutionRegistry::Query ContractionSolutionRegistry::Query::query(
-        hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD) const
+    ContractionSolutionRegistry::Query
+        ContractionSolutionRegistry::Query::query(hipDataType            typeA,
+                                                  hipDataType            typeB,
+                                                  hipDataType            typeC,
+                                                  hipDataType            typeD,
+                                                  hiptensorComputeType_t typeCompute) const
     {
-        return query(hashTypesABCD(typeA, typeB, typeC, typeD));
+        return query(hashTypesComputeABCD(typeA, typeB, typeC, typeD, typeCompute));
     }

     ContractionSolutionRegistry::Query
@@ -159,18 +164,20 @@ namespace hiptensor

     /* static */
     ContractionSolutionRegistry::Query::HashId
-        ContractionSolutionRegistry::Query::hashSolution(int32_t             dimsM,
-                                                         int32_t             dimsN,
-                                                         int32_t             dimsK,
-                                                         hipDataType         typeA,
-                                                         hipDataType         typeB,
-                                                         hipDataType         typeC,
-                                                         hipDataType         typeD,
-                                                         hiptensorOperator_t opA,
-                                                         hiptensorOperator_t opB,
-                                                         ContractionOpId_t   opCDE)
+        ContractionSolutionRegistry::Query::hashSolution(int32_t                dimsM,
+                                                         int32_t                dimsN,
+                                                         int32_t                dimsK,
+                                                         hipDataType            typeA,
+                                                         hipDataType            typeB,
+                                                         hipDataType            typeC,
+                                                         hipDataType            typeD,
+                                                         hiptensorOperator_t    opA,
+                                                         hiptensorOperator_t    opB,
+                                                         ContractionOpId_t      opCDE,
+                                                         hiptensorComputeType_t typeCompute)
     {
-        return Hash{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE);
+        return Hash{}(
+            dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD, opA, opB, opCDE, typeCompute);
     }

     /* static */
@@ -181,10 +188,14 @@ namespace hiptensor
     }

     /* static */
-    ContractionSolutionRegistry::Query::HashId ContractionSolutionRegistry::Query::hashTypesABCD(
-        hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD)
+    ContractionSolutionRegistry::Query::HashId
+        ContractionSolutionRegistry::Query::hashTypesComputeABCD(hipDataType            typeA,
+                                                                 hipDataType            typeB,
+                                                                 hipDataType            typeC,
+                                                                 hipDataType            typeD,
+                                                                 hiptensorComputeType_t typeCompute)
     {
-        return Hash{}(typeA, typeB, typeC, typeD);
+        return Hash{}(typeA, typeB, typeC, typeD, typeCompute);
     }
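
hashSolution and hashTypesComputeABCD both delegate to the variadic Hash functor, so extending the key is just one more argument at each call site. A minimal stand-in for that kind of functor (boost-style combine; not hiptensor's implementation):

```cpp
#include <cstddef>
#include <functional>

struct HashSketch
{
    template <typename... Ts>
    std::size_t operator()(Ts const&... ts) const
    {
        std::size_t seed = 0;
        // fold each field into the seed, boost::hash_combine style
        ((seed ^= std::hash<std::size_t>{}(static_cast<std::size_t>(ts)) + 0x9e3779b9
                  + (seed << 6) + (seed >> 2)),
         ...);
        return seed;
    }
};

// e.g. HashSketch{}(dimsM, dimsN, dimsK, typeA, typeB, typeC, typeD,
//                   opA, opB, opCDE, typeCompute)
// — appending typeCompute changes the resulting key.
```
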
@@ -220,12 +231,16 @@ namespace hiptensor
                                          params->typeD(),
                                          params->opA(),
                                          params->opB(),
-                                         params->opCDE());
+                                         params->opCDE(),
+                                         params->typeCompute());

         auto dimsMNKHash = hashDimsMNK(params->dimsM(), params->dimsN(), params->dimsK());

-        auto typesABCDHash
-            = hashTypesABCD(params->typeA(), params->typeB(), params->typeC(), params->typeD());
+        auto typesComputeABCDHash = hashTypesComputeABCD(params->typeA(),
+                                                         params->typeB(),
+                                                         params->typeC(),
+                                                         params->typeD(),
+                                                         params->typeCompute());

         auto elementOpsHash = hashElementOps(params->opA(), params->opB());
@@ -236,7 +251,7 @@ namespace hiptensor
         mAllSolutions[solutionUid] = solution;
         mSolutionHash[solutionHash].push_back(solution);
         mSolutionHash[dimsMNKHash].push_back(solution);
-        mSolutionHash[typesABCDHash].push_back(solution);
+        mSolutionHash[typesComputeABCDHash].push_back(solution);
         mSolutionHash[elementOpsHash].push_back(solution);
         mSolutionHash[contactionOpsHash].push_back(solution);
     }
diff --git a/library/src/contraction/contraction_solution_registry.hpp b/library/src/contraction/contraction_solution_registry.hpp
index d1b80ec5..44aaa97d 100644
--- a/library/src/contraction/contraction_solution_registry.hpp
+++ b/library/src/contraction/contraction_solution_registry.hpp
@@ -59,25 +59,27 @@ namespace hiptensor
             /// E.g. in this context, query further parameters.

             // By full solution type
-            Query query(int32_t             dimsM,
-                        int32_t             dimsN,
-                        int32_t             dimsK,
-                        hipDataType         typeA,
-                        hipDataType         typeB,
-                        hipDataType         typeC,
-                        hipDataType         typeD,
-                        hiptensorOperator_t opA,
-                        hiptensorOperator_t opB,
-                        ContractionOpId_t   opCDE) const;
+            Query query(int32_t                dimsM,
+                        int32_t                dimsN,
+                        int32_t                dimsK,
+                        hipDataType            typeA,
+                        hipDataType            typeB,
+                        hipDataType            typeC,
+                        hipDataType            typeD,
+                        hiptensorOperator_t    opA,
+                        hiptensorOperator_t    opB,
+                        ContractionOpId_t      opCDE,
+                        hiptensorComputeType_t typeCompute) const;

             // By dimensions
             Query query(int32_t dimsM, int32_t dimsN, int32_t dimsK) const;

             // By data types
-            Query query(hipDataType typeA,
-                        hipDataType typeB,
-                        hipDataType typeC,
-                        hipDataType typeD) const;
+            Query query(hipDataType            typeA,
+                        hipDataType            typeB,
+                        hipDataType            typeC,
+                        hipDataType            typeD,
+                        hiptensorComputeType_t typeCompute) const;

             // By element-wise operations
             Query query(hiptensorOperator_t opA, hiptensorOperator_t opB) const;
@@ -104,22 +106,24 @@ namespace hiptensor
             Query query(HashId queryHash) const;

             // Hashing helpers
-            static HashId hashSolution(int32_t             dimsM,
-                                       int32_t             dimsN,
-                                       int32_t             dimsK,
-                                       hipDataType         typeA,
-                                       hipDataType         typeB,
-                                       hipDataType         typeC,
-                                       hipDataType         typeD,
-                                       hiptensorOperator_t opA,
-                                       hiptensorOperator_t opB,
-                                       ContractionOpId_t   opCDE);
+            static HashId hashSolution(int32_t                dimsM,
+                                       int32_t                dimsN,
+                                       int32_t                dimsK,
+                                       hipDataType            typeA,
+                                       hipDataType            typeB,
+                                       hipDataType            typeC,
+                                       hipDataType            typeD,
+                                       hiptensorOperator_t    opA,
+                                       hiptensorOperator_t    opB,
+                                       ContractionOpId_t      opCDE,
+                                       hiptensorComputeType_t typeCompute);

             static HashId hashDimsMNK(int32_t dimsM, int32_t dimsN, int32_t dimsK);

-            static HashId hashTypesABCD(hipDataType typeA,
-                                        hipDataType typeB,
-                                        hipDataType typeC,
-                                        hipDataType typeD);
+            static HashId hashTypesComputeABCD(hipDataType            typeA,
+                                               hipDataType            typeB,
+                                               hipDataType            typeC,
+                                               hipDataType            typeD,
+                                               hiptensorComputeType_t typeCompute);

             static HashId hashElementOps(hiptensorOperator_t opA, hiptensorOperator_t opB);
             static HashId hashContractionOps(ContractionOpId_t opCDE);
diff --git a/library/src/contraction/device/CMakeLists.txt b/library/src/contraction/device/CMakeLists.txt
index f2e4a0fb..eacac5b1
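
registerSolution files each solution under several keys at once, which is why the rename from typesABCDHash to typesComputeABCDHash ripples through: the coarse by-types bucket must now also split by compute type. The shape of that secondary index, reduced to essentials:

```cpp
#include <cstddef>
#include <unordered_map>
#include <vector>

struct SolutionStub {}; // stand-in for ContractionSolution

// One solution, several buckets: the exact key plus coarser views
// (dims-only, types+compute-only, ...), as in registerSolution above.
std::unordered_map<std::size_t, std::vector<SolutionStub*>> solutionHash;

void registerStub(SolutionStub* s,
                  std::size_t   fullKey,
                  std::size_t   dimsKey,
                  std::size_t   typesComputeKey)
{
    solutionHash[fullKey].push_back(s);
    solutionHash[dimsKey].push_back(s);
    solutionHash[typesComputeKey].push_back(s);
}
```
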
 100644
--- a/library/src/contraction/device/CMakeLists.txt
+++ b/library/src/contraction/device/CMakeLists.txt
@@ -24,24 +24,64 @@
 #
 ###############################################################################

-set(CK_CONTRACTION_INSTANCE_SOURCES
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp
-)
+ set(CK_CONTRACTION_INSTANCE_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp + ) add_hiptensor_component(hiptensor_contraction_instances ${CK_CONTRACTION_INSTANCE_SOURCES}) target_include_directories(hiptensor_contraction_instances PRIVATE ${composable_kernel_INCLUDES}) diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..3b3f6d47 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
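The instance files listed above follow a fixed naming scheme: the tensor data types in order (A, B, then D for bilinear, then E; the scale kernels have no D tensor and carry three tokens), an optional compute_* token for the compute/accumulation type, and a layout suffix naming the fastest-varying index group of each tensor (kknn: A and B fastest in k, D and E fastest in n). As a sketch of what one layout letter implies, here are the packed strides of A[m0, m1, k0, k1] with k fastest; the dimension names are taken from the comments in these files, the helper itself is illustrative.

#include <array>
#include <cstddef>

// Packed ("k"-fastest) strides for A[m0, m1, k0, k1]; k1 varies fastest.
std::array<std::size_t, 4> packedStridesA([[maybe_unused]] std::size_t m0,
                                          std::size_t m1,
                                          std::size_t k0,
                                          std::size_t k1)
{
    return {m1 * k0 * k1, // m0 stride
            k0 * k1,      // m1 stride
            k1,           // k0 stride
            1};           // k1 stride (fastest)
}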
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..fd43f0ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
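The comment block in each of these files states the operation the bilinear instances implement: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1], with alpha/beta scaling applied by the Bilinear element-wise operation (E = alpha * (A contracted with B) + beta * D). A plain-loop sketch of the same math over float buffers; the instance above runs bf16 data with f32 accumulation on the GPU, and the packed layouts assumed below are for illustration only.

#include <cstddef>
#include <vector>

void bilinearContraction(
    const std::vector<float>& A, // [M0][M1][K0][K1], k fastest
    const std::vector<float>& B, // [N0][N1][K0][K1], k fastest
    const std::vector<float>& D, // [M0][M1][N0][N1], n fastest
    std::vector<float>&       E, // [M0][M1][N0][N1], n fastest
    float alpha, float beta,
    std::size_t M0, std::size_t M1, std::size_t N0, std::size_t N1,
    std::size_t K0, std::size_t K1)
{
    for(std::size_t m0 = 0; m0 < M0; ++m0)
        for(std::size_t m1 = 0; m1 < M1; ++m1)
            for(std::size_t n0 = 0; n0 < N0; ++n0)
                for(std::size_t n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f; // f32 accumulator, as in the instance
                    for(std::size_t k0 = 0; k0 < K0; ++k0)
                        for(std::size_t k1 = 0; k1 < K1; ++k1)
                        {
                            auto a = A[((m0 * M1 + m1) * K0 + k0) * K1 + k1];
                            auto b = B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
                            acc += a * b;
                        }
                    auto e = ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
                    E[e]   = alpha * acc + beta * D[e]; // Bilinear epilogue
                }
}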
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..21fb8127 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..cc975c03 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ff670630 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..be8bfe84 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..4be69898 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..2f6d630b --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp new file mode 100644 index 00000000..cc21216c --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp new file mode 100644 index 00000000..57c47457 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp new file mode 100644 index 00000000..a121fbb3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp new file mode 100644 index 00000000..7962da9f --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp new file mode 100644 index 00000000..ea2be147 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp new file mode 100644 index 00000000..d82ea442 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp new file mode 100644 index 00000000..772df2e3 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp new file mode 100644 index 00000000..8b1d0681 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index d8b80eb9..f924889f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( @@ -89,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp 
b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index 5444adc3..ad94eb1f 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index b20c1204..8fb870a0 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 2bc3d1f2..aa3e9d32 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
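The S<...> entries in the removed tables are ck::Sequence parameters: thread-cluster shapes and access orders for the block transfers. One invariant worth noting when reading the rows: each ABlockTransfer/BBlockTransfer ThreadCluster Lengths_K0_M_K1 (or _K0_N_K1) shape multiplies out to the row's Block Size, since the whole workgroup cooperates on each tile load. A compile-time sketch of that check (this S is a local stand-in for ck::Sequence, not the library alias):

// Product of a thread-cluster shape must equal the workgroup size.
template <int... Is>
struct S
{
    static constexpr int product = (Is * ...);
};

static_assert(S<4, 64, 1>::product == 256);  // pairs with BlockSize 256 rows
static_assert(S<8, 32, 1>::product == 256);  // ditto
static_assert(S<16, 16, 1>::product == 256); // ditto
static_assert(S<4, 32, 1>::product == 128);  // pairs with BlockSize 128 rows
static_assert(S<8, 16, 1>::product == 128);  // ditto
static_assert(S<4, 16, 1>::product == 64);   // pairs with BlockSize 64 rows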
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using F32_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( @@ -92,8 +71,8 @@ namespace ck F32, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp new file mode 100644 index 00000000..ad5ce461 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
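The four-letter suffix on each instance name (kknn, knnn, mknn, mnnn) repeats the per-file comment: it lists the fastest-changing dimension of A, B, D and E, in that order, and each file instantiates the matching device_contraction_kk/kn/mk/mn layout preset. For a packed A[m0, m1, k0, k1], the two A-side conventions translate into strides as follows (a sketch under the packed-layout assumption; the helper names are illustrative):

#include <array>
#include <cstddef>

// "k" variant: k1 has unit stride, memory order (m0, m1, k0, k1).
std::array<std::size_t, 4>
a_strides_k_fastest(std::size_t M1, std::size_t K0, std::size_t K1)
{
    return {M1 * K0 * K1, K0 * K1, K1, 1}; // strides of (m0, m1, k0, k1)
}

// "m" variant: m1 has unit stride, memory order (k0, k1, m0, m1).
std::array<std::size_t, 4>
a_strides_m_fastest(std::size_t M0, std::size_t M1, std::size_t K1)
{
    return {M1, 1, K1 * M0 * M1, M0 * M1}; // strides of (m0, m1, k0, k1)
}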
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include <cstdlib> + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance + = device_contraction_f64_kk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( + std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp new file mode 100644 index 00000000..ae3ee856 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include <cstdlib> + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance + = device_contraction_f64_kn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( + std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp new file mode 100644 index 00000000..b72005ad --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include <cstdlib> + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance + = device_contraction_f64_mk_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( + std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp new file mode 100644 index 00000000..b94030e5 --- /dev/null +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include <cstdlib> + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance + = device_contraction_f64_mn_instance; + + void + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( + std::vector<std::unique_ptr<DeviceContractionMultipleD<2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index a1fe1ddf..a65ae1eb 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary!
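At a call site, each of these add_* factories fills a vector of type-erased device ops, which the caller can then iterate, query for support, and time. A usage sketch for the new f64 kknn variant with f32 compute; the DeviceContractionMultipleD parameter order shown here (dims, A/B/Ds/E types, elementwise ops, then ComputeDataType) is an assumption inferred from the signature hunks above, not quoted from the patch:

#include <iostream>
#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using F32         = float;
using F64         = double;
using F64_Tuple   = ck::Tuple<F64>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear    = ck::tensor_operation::element_wise::Bilinear;

int main()
{
    // Assumed template-argument order: NumDimM, NumDimN, NumDimK, ADataType,
    // BDataType, DsDataType, EDataType, A/B/CDE elementwise ops, ComputeDataType.
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceContractionMultipleD<
        2, 2, 2, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, F32>>>
        instances;

    ck::tensor_operation::device::instance::
        add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance(
            instances);

    std::cout << instances.size() << " f64 kknn (compute f32) instances registered\n";
}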
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index a635bce8..4d6ccaa8 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index c77ffea4..071ccf62 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
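Every instance in these files implements the same rank-(2,2,2) contraction stated in the per-file comments, E[m0,m1,n0,n1] = alpha * sum over k0,k1 of A[m0,m1,k0,k1] * B[n0,n1,k0,k1] + beta * D[m0,m1,n0,n1]; only the memory layout (which index is fastest-changing) differs between the kknn/knnn/mknn/mnnn variants. A hedged CPU sketch of the math these device instances accelerate, assuming plain row-major storage with k fastest for A and B (an illustration, not hipTensor's reference implementation):

    #include <cstddef>
    #include <vector>

    // Reference rank-(2,2,2) contraction:
    //   E[m0,m1,n0,n1] = alpha * sum_{k0,k1} A[m0,m1,k0,k1] * B[n0,n1,k0,k1]
    //                    + beta * D[m0,m1,n0,n1]
    void contract_2x2x2(const std::vector<double>& A,
                        const std::vector<double>& B,
                        const std::vector<double>& D,
                        std::vector<double>&       E,
                        std::size_t M0, std::size_t M1,
                        std::size_t N0, std::size_t N1,
                        std::size_t K0, std::size_t K1,
                        double alpha, double beta)
    {
        for(std::size_t m0 = 0; m0 < M0; ++m0)
        for(std::size_t m1 = 0; m1 < M1; ++m1)
        for(std::size_t n0 = 0; n0 < N0; ++n0)
        for(std::size_t n1 = 0; n1 < N1; ++n1)
        {
            double acc = 0.0;
            for(std::size_t k0 = 0; k0 < K0; ++k0)
            for(std::size_t k1 = 0; k1 < K1; ++k1)
            {
                // A stored as [m0][m1][k0][k1], B as [n0][n1][k0][k1], k fastest
                acc += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1]
                     * B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
            }
            const std::size_t e = ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
            E[e] = alpha * acc + beta * D[e];
        }
    }
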
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index c8a96a70..d8223df7 100644 --- a/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
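The kknn/knnn/mknn/mnnn suffixes encode which index is fastest-changing for A/B/D/E, and each variant swaps the block-transfer access-order template arguments to match (the S<1, 0, 2> versus S<0, 2, 1> orderings visible in the instance lists above). A small sketch of how that fastest-changing choice maps to strides for the rank-4 A tensor; the helper below is illustrative only, not a hipTensor function:

    #include <array>
    #include <cstddef>

    // Strides for A in (m0, m1, k0, k1) index order.
    // "k" layouts store A as [m0][m1][k0][k1] (k1 contiguous);
    // "m" layouts store A as [k0][k1][m0][m1] (m1 contiguous).
    std::array<std::size_t, 4> a_strides(bool k_fastest,
                                         std::size_t M0, std::size_t M1,
                                         std::size_t K0, std::size_t K1)
    {
        if(k_fastest) // stride(k1) == 1
            return {M1 * K0 * K1, K0 * K1, K1, 1};
        else          // stride(m1) == 1
            return {M1, 1, K1 * M0 * M1, M0 * M1};
    }
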
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using F64_Tuple = ck::Tuple; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Bilinear = ck::tensor_operation::element_wise::Bilinear; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( @@ -86,8 +71,8 @@ namespace ck F64, PassThrough, PassThrough, - Bilinear>>>& - instances) + Bilinear, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..1da8301f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
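The new *_bf16_bf16_bf16_compute_f32_* scale files that follow keep A, B, and E in bfloat16 storage but accumulate in float: bf16 occupies the top 16 bits of an IEEE-754 float and is stored as a 16-bit integer type, so operands must be widened before the multiply. A minimal sketch of that widen-multiply-accumulate pattern, using plain uint16_t as a stand-in for the bf16 storage type:

    #include <cstdint>
    #include <cstring>

    // bf16 is the upper half of an IEEE-754 float, so widening is a shift.
    float bf16_to_f32(std::uint16_t x)
    {
        const std::uint32_t bits = static_cast<std::uint32_t>(x) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // Dot product of bf16 inputs accumulated in f32, as the compute_f32
    // instances do: widen each operand, then multiply-add in float.
    float dot_bf16_acc_f32(const std::uint16_t* a, const std::uint16_t* b, int n)
    {
        float acc = 0.f;
        for(int i = 0; i < n; ++i)
            acc += bf16_to_f32(a[i]) * bf16_to_f32(b[i]);
        return acc;
    }
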
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..82c17500 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..1febb560 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..02b9d719 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..5917e466 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
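Each of these new translation units exposes exactly one add_device_contraction_scale_* entry point that appends its tuple of template instantiations to a caller-owned vector, and add_device_operation_instances essentially default-constructs every element of that tuple into the registry. A hedged sketch of the registration idiom with generic names (not CK's exact signatures):

    #include <memory>
    #include <tuple>
    #include <vector>

    struct BaseOp { virtual ~BaseOp() = default; };

    // Default-construct every type in OpTuple and append it to the registry,
    // mirroring how a tuple of device-op instantiations gets registered.
    template <typename OpTuple>
    void add_operation_instances(std::vector<std::unique_ptr<BaseOp>>& registry)
    {
        std::apply([&](auto... ops)
                   { (registry.push_back(std::make_unique<decltype(ops)>(ops)), ...); },
                   OpTuple{});
    }

    struct OpA : BaseOp {};
    struct OpB : BaseOp {};

    int main()
    {
        std::vector<std::unique_ptr<BaseOp>> registry;
        add_operation_instances<std::tuple<OpA, OpB>>(registry); // size() == 2
    }
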
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp new file mode 100644 index 00000000..216f470e --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp new file mode 100644 index 00000000..3401b605 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp new file mode 100644 index 00000000..fe2fa97d --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp new file mode 100644 index 00000000..9a104075 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
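The *_f32_f32_f32_compute_bf16_* variants below run in the opposite direction: tensor data stays f32 in memory, but the requested compute type is bf16, so operands lose precision before the multiply. A sketch of that narrowing, assuming simple truncation for clarity; real conversions typically round to nearest-even rather than truncate:

    #include <cstdint>
    #include <cstring>

    // Drop the low 16 mantissa bits of an f32 to emulate bf16 compute
    // precision. NOTE: plain truncation for brevity; production conversions
    // round to nearest-even.
    float round_f32_to_bf16_precision(float f)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        bits &= 0xFFFF0000u;
        std::memcpy(&f, &bits, sizeof(bits));
        return f;
    }

Multiplying two operands passed through this helper, then accumulating in f32, approximates what a compute_bf16 instance does to f32 inputs.
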
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp new file mode 100644 index 00000000..6a7f565f --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance + = device_contraction_kn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp new file mode 100644 index 00000000..094655bb --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance + = device_contraction_mk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp new file mode 100644 index 00000000..583b5b00 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance + = device_contraction_mn_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp new file mode 100644 index 00000000..8eec79cf --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
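Unlike the bilinear files earlier in this diff, every scale instance has an empty Ds tuple: there is no C/D input, and the epilogue is just E = alpha*(A contracted with B). A sketch of the corresponding Scale elementwise functor; as with the bilinear sketch above, the member names are assumptions rather than CK's exact API:

    // Simplified stand-in for a scale CDE epilogue functor: no fused D input,
    // the epilogue only scales the contraction result.
    struct ScaleSketch
    {
        float alpha;

        template <typename E, typename C>
        void operator()(E& e, const C& c) const
        {
            e = static_cast<E>(alpha * static_cast<float>(c));
        }
    };

    // usage: float e; ScaleSketch{2.f}(e, 3.f); // e == 6.f
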
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck +{ + namespace tensor_operation + { + namespace device + { + namespace instance + { + + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E + using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance + = device_contraction_kk_instance; + + void + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( + std::vector>>& instances) + { + add_device_operation_instances( + instances, + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); + } + + } // namespace instance + } // namespace device + } // namespace tensor_operation +} // namespace ck diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp new file mode 100644 index 00000000..a8999be8 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance
+                    = device_contraction_kn_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp
new file mode 100644
index 00000000..e4e4b7de
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance
+                    = device_contraction_mk_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp
new file mode 100644
index 00000000..a641f6e3
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance
+                    = device_contraction_mn_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
index 88345e74..24d2d570 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,42 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 
1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> - // clang-format on - >; + = device_contraction_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( std::vector>>& instances) + Scale, + 
F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 38702afd..f559dc06 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // k/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| 
ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 735a5e34..a522052d 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
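+//
+// (With this trick enabled, out-of-bounds buffer accesses are expected to be neutralized
+// by the buffer resource descriptor's hardware range check, by offsetting them past the
+// valid range, rather than by explicit software predication. This reading is inferred
+// from the macro name and CK conventions; it is not stated in this patch.)
+//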
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/k/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index d286e2d8..be35683b 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! -// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,45 +46,19 @@ namespace ck namespace instance { - using F32 = float; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] - // m/n/n are the fast changing dimension for A/B/E + // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] + // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - 
//#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 
2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> - // clang-format on - >; + = device_contraction_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( std::vector>>& instances) + Scale, + F32>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp new file mode 100644 index 00000000..dac46620 --- /dev/null +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance
+                    = device_contraction_f64_kk_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
new file mode 100644
index 00000000..0830b49f
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // k/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance
+                    = device_contraction_f64_kn_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
new file mode 100644
index 00000000..9a716ba3
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/k/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance
+                    = device_contraction_f64_mk_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
new file mode 100644
index 00000000..e02ac144
--- /dev/null
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *******************************************************************************/
+
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
+
+#include
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
+#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck
+{
+    namespace tensor_operation
+    {
+        namespace device
+        {
+            namespace instance
+            {
+
+                // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
+                // m/n/n/n are the fast changing dimension for A/B/D/E
+                using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance
+                    = device_contraction_f64_mn_instance;
+
+                void
+                    add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
+                        std::vector>>& instances)
+                {
+                    add_device_operation_instances(
+                        instances,
+                        device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{});
+                }
+
+            } // namespace instance
+        } // namespace device
+    } // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
index f8904a8f..6f168ee2 100644
--- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
+++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
@@ -24,13 +24,18 @@
  *
  *******************************************************************************/
 
-// This (ifndef) is a hack to use customized behavior for buffer load rather
-// than using default setting Don't use this hack unless absolutely necessary!
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> - // clang-format on - >; + = device_contraction_f64_kk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 56fc8b91..347a810c 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // k/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_kn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 231a0256..229d18c7 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/k/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mk_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 4fc648d4..bf1efa14 100644 --- a/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/library/src/contraction/device/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -24,13 +24,18 @@ * *******************************************************************************/ -// This (ifndef) is a hack to use customized behavior for buffer load rather -// than using default setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter -// of each device op +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 -#include "common.hpp" +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -41,39 +46,19 @@ namespace ck namespace instance { - using F64 = double; - using Empty_Tuple = ck::Tuple<>; - - template - using S = ck::Sequence; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using Scale = ck::tensor_operation::element_wise::Scale; - - static constexpr auto GemmMNKPadding - = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - // A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] // m/n/n/n are the fast changing dimension for A/B/D/E using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance - = std::tuple< - // clang-format off - //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, 
PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, - DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> - // clang-format on - >; + = device_contraction_f64_mn_instance; void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( std::vector>>& instances) + Scale, + F64>>>& instances) { add_device_operation_instances( instances, diff --git a/library/src/contraction/device/contraction_bilinear.hpp b/library/src/contraction/device/hiptensor_contraction_bilinear.hpp similarity index 100% rename from library/src/contraction/device/contraction_bilinear.hpp rename to library/src/contraction/device/hiptensor_contraction_bilinear.hpp diff --git a/library/src/contraction/device/contraction_scale.hpp b/library/src/contraction/device/hiptensor_contraction_scale.hpp similarity index 100% rename from library/src/contraction/device/contraction_scale.hpp rename to library/src/contraction/device/hiptensor_contraction_scale.hpp diff --git a/library/src/contraction/hiptensor_contraction.cpp b/library/src/contraction/hiptensor_contraction.cpp index 09f5ddf6..8148eeaa 100644 --- a/library/src/contraction/hiptensor_contraction.cpp +++ b/library/src/contraction/hiptensor_contraction.cpp @@ -242,17 +242,6 @@ hiptensorStatus_t hiptensorInitContractionFind(const hiptensorHandle_t* handl auto& 
instances = hiptensor::ContractionSolutionInstances::instance(); auto solnQ = instances->allSolutions(); - // Check if the current device supports F64 - if(!currentDevice.supportsF64()) - { - // Allow only supported f32 combos - solnQ = solnQ.query(HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F) || // Bilinear F32 - solnQ.query(HIP_R_32F, - HIP_R_32F, - hipDataType(hiptensor::NONE_TYPE), - HIP_R_32F); // Scale F32 (no C) - } - // Can do more checking for scale / bilinear, etc. if we need to. if(solnQ.solutionCount() == 0) @@ -461,15 +450,16 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* // Convert to concrete contraction solutions auto candidates = toContractionSolutionVec(find->mCandidates); - auto ADataType = desc->mTensorDesc[0].mType; - auto BDataType = desc->mTensorDesc[1].mType; - auto DDataType = desc->mTensorDesc[2].mType; - auto EDataType = desc->mTensorDesc[3].mType; + auto computeType = desc->mComputeType; + auto ADataType = desc->mTensorDesc[0].mType; + auto BDataType = desc->mTensorDesc[1].mType; + auto DDataType = desc->mTensorDesc[2].mType; + auto EDataType = desc->mTensorDesc[3].mType; // Query contraction solutions for the correct contraction operation and type auto solutionQ = hiptensor::ContractionSolutionRegistry::Query{candidates} .query((hiptensor::ContractionOpId_t)desc->mContractionOpId) - .query(ADataType, BDataType, DDataType, EDataType); + .query(ADataType, BDataType, DDataType, EDataType, computeType); candidates = toContractionSolutionVec(solutionQ.solutions()); @@ -500,6 +490,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } else if(find->mSelectionAlgorithm == HIPTENSOR_ALGO_ACTOR_CRITIC) @@ -518,6 +509,7 @@ hiptensorStatus_t hiptensorInitContractionPlan(const hiptensorHandle_t* EDataType, desc->mTensorDesc[3].mLengths, desc->mTensorDesc[3].mStrides, + desc->mComputeType, workspaceSize); } @@ -582,18 +574,9 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf( - alphaMsg, sizeof(alphaMsg), "alpha=%.6f", *(static_cast(alpha))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf(alphaMsg, - sizeof(alphaMsg), - "alpha=%.6lf", - *(static_cast(alpha))); - } + auto alphaValue + = hiptensor::readVal(alpha, plan->mContractionDesc.mComputeType); + snprintf(alphaMsg, sizeof(alphaMsg), "alpha=%.6lf", alphaValue); } if(beta == nullptr) @@ -602,15 +585,8 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, } else { - if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_32F) - { - snprintf(betaMsg, sizeof(betaMsg), "beta=%.6f", *(static_cast(beta))); - } - else if(plan->mContractionDesc.mComputeType == HIPTENSOR_COMPUTE_64F) - { - snprintf( - betaMsg, sizeof(betaMsg), "beta=%.6lf", *(static_cast(beta))); - } + auto betaValue = hiptensor::readVal(beta, plan->mContractionDesc.mComputeType); + snprintf(betaMsg, sizeof(betaMsg), "beta=%.6lf", betaValue); } } else @@ -708,17 +684,6 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, return errorCode; } - if(plan->mContractionDesc.mComputeType != plan->mContractionDesc.mTensorDesc[3].mType) - { - auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; - snprintf(msg, - sizeof(msg), - "Internal Error : compute type != D type (%s)", - hiptensorGetErrorString(errorCode)); - 
logger->logError("hiptensorContraction", msg); - return errorCode; - } - auto* cSolution = (hiptensor::ContractionSolution*)(plan->mSolution); auto canRun = cSolution->initArgs(alpha, @@ -755,7 +720,17 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction with timing if LOG_LEVEL_PERF_TRACE if(logger->getLogMask() & HIPTENSOR_LOG_LEVEL_PERF_TRACE) { - auto time = (*cSolution)(StreamConfig{stream, true}); + auto time = (*cSolution)(StreamConfig{ + stream, // stream id + true, // time_kernel + 0, // log_level + 0, // cold_niters + 1, // nrepeat + }); + if(time < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } int32_t m, n, k; std::tie(m, n, k) = cSolution->problemDims(); @@ -784,7 +759,10 @@ hiptensorStatus_t hiptensorContraction(const hiptensorHandle_t* handle, // Perform contraction without timing else { - (*cSolution)(StreamConfig{stream, false}); + if((*cSolution)(StreamConfig{stream, false}) < 0) + { + return HIPTENSOR_STATUS_CK_ERROR; + } } return HIPTENSOR_STATUS_SUCCESS; diff --git a/library/src/data_types.cpp b/library/src/data_types.cpp index b270973d..38e9f186 100644 --- a/library/src/data_types.cpp +++ b/library/src/data_types.cpp @@ -132,6 +132,49 @@ namespace hiptensor } } + void writeVal(void const* addr, hiptensorComputeType_t id, double value) + { + if(id == HIPTENSOR_COMPUTE_16F) + { + *(_Float16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_16BF) + { + *(hip_bfloat16*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32F) + { + *(float*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_64F) + { + *(double*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8U) + { + *(uint8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_8I) + { + *(int8_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32U) + { + *(uint32_t*)addr = value; + } + else if(id == HIPTENSOR_COMPUTE_32I) + { + *(int32_t*)addr = value; + } + else + { +#if !NDEBUG + std::cout << "Unhandled hiptensorComputeType_t: " << id << std::endl; +#endif // !NDEBUG + return; + } + } + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType) diff --git a/library/src/hiptensor.cpp b/library/src/hiptensor.cpp index 9740d2a8..51af1f48 100644 --- a/library/src/hiptensor.cpp +++ b/library/src/hiptensor.cpp @@ -152,7 +152,8 @@ hiptensorStatus_t hiptensorInitTensorDescriptor(const hiptensorHandle_t* han } if((lens == nullptr) - || ((dataType != HIP_R_16F) && (dataType != HIP_R_32F) && (dataType != HIP_R_64F)) + || ((dataType != HIP_R_16F) && (dataType != HIP_R_16BF) && (dataType != HIP_R_32F) + && (dataType != HIP_R_64F)) || unaryOp != HIPTENSOR_OP_IDENTITY) { auto errorCode = HIPTENSOR_STATUS_INVALID_VALUE; diff --git a/library/src/include/data_types.hpp b/library/src/include/data_types.hpp index 42197650..19ccca6c 100644 --- a/library/src/include/data_types.hpp +++ b/library/src/include/data_types.hpp @@ -65,6 +65,8 @@ namespace hiptensor template T readVal(void const* value, hiptensorComputeType_t id); + void writeVal(void const* addr, hiptensorComputeType_t id, double value); + } // namespace hiptensor bool operator==(hipDataType hipType, hiptensorComputeType_t computeType); diff --git a/samples/01_contraction/CMakeLists.txt b/samples/01_contraction/CMakeLists.txt index ada3ce61..00393f1d 100644 --- a/samples/01_contraction/CMakeLists.txt +++ b/samples/01_contraction/CMakeLists.txt @@ -26,16 +26,63 @@ # Check whether building within hiptensor context if( CMAKE_PROJECT_NAME STREQUAL "hiptensor" ) - 
add_hiptensor_sample(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - add_hiptensor_sample(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + add_hiptensor_sample(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + add_hiptensor_sample(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + add_hiptensor_sample(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) # If building hipTensor samples as a standalone Cmake project else() + add_executable(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_scale_f32 simple_scale_contraction_f32.cpp) - target_link_libraries(simple_contraction_scale_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f16_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) - add_executable(simple_contraction_bilinear_f32 simple_bilinear_contraction_f32.cpp) - target_link_libraries(simple_contraction_bilinear_f32 PRIVATE hiptensor::hiptensor) + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 
simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f32_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_bilinear_contraction_f64_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_bf16_bf16_bf16_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f16_f16_f16_compute_f16 simple_scale_contraction_f16_f16_f16_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f16_f16_f16_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_bf16 simple_scale_contraction_f32_f32_f32_compute_bf16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_bf16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_f16 simple_scale_contraction_f32_f32_f32_compute_f16.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f16 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f32_f32_f32_compute_f32 simple_scale_contraction_f32_f32_f32_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f32_f32_f32_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f32 simple_scale_contraction_f64_f64_f64_compute_f32.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f32 PRIVATE hiptensor::hiptensor) + + add_executable(simple_scale_contraction_f64_f64_f64_compute_f64 simple_scale_contraction_f64_f64_f64_compute_f64.cpp) + target_link_libraries(simple_scale_contraction_f64_f64_f64_compute_f64 PRIVATE hiptensor::hiptensor) endif() diff --git a/samples/01_contraction/simple_bilinear_contraction_f32.cpp b/samples/01_contraction/simple_bilinear_contraction.hpp similarity index 87% rename from samples/01_contraction/simple_bilinear_contraction_f32.cpp rename to samples/01_contraction/simple_bilinear_contraction.hpp index 5704a59d..27001232 100644 --- a/samples/01_contraction/simple_bilinear_contraction_f32.cpp +++ b/samples/01_contraction/simple_bilinear_contraction.hpp @@ -28,35 +28,23 @@ #include #include #include -#include #include #include #include #include "common.hpp" -int main(int argc, char* argv[]) +template +int bilinearContractionSample() { - /*************************************** - * Check device support * - **************************************/ - if(!isF32Supported()) - { - std::cout << "unsupported host device" << std::endl; - exit(EXIT_FAILURE); - } - - typedef float ADataType; - typedef float BDataType; - typedef float CDataType; - typedef float floatTypeCompute; - - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeC = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - - floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute alpha = (floatTypeCompute)1.0f; floatTypeCompute beta = (floatTypeCompute)1.0f; 
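// Illustrative sketch (not part of the patch): the sample body above is now a
// template, but this rendering elides its parameter list. Assuming it is
// parameterized over the tensor element types plus the hipDataType and
// hiptensorComputeType_t constants used by the new per-type translation units,
// an instantiation for the all-f32 case would look roughly like:
//
//     return bilinearContractionSample<float,      // ADataType
//                                      float,      // BDataType
//                                      float,      // CDataType
//                                      float,      // floatTypeCompute
//                                      HIP_R_32F,  // typeA
//                                      HIP_R_32F,  // typeB
//                                      HIP_R_32F,  // typeC
//                                      HIPTENSOR_COMPUTE_32F>(); // typeCompute
//
// The parameter names and order are assumptions inferred from the typedefs and
// constexpr values in the new sample .cpp files; the elided template header in
// this diff is authoritative.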
/********************** @@ -74,12 +62,12 @@ int main(int argc, char* argv[]) std::unordered_map extent; - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; std::vector c_ms_ns_lengths; for(auto mode : modeC) @@ -166,19 +154,41 @@ int main(int argc, char* argv[]) /******************* * Initialize data *******************/ + int initMethod = 1; // TODO: read value from command line for(int64_t i = 0; i < elementsA; i++) { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsB; i++) { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsC; i++) { - C[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + C[i] = CDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + C[i] = (CDataType)(float(i) / 100); + } } /******************************************** @@ -193,7 +203,6 @@ int main(int argc, char* argv[]) /************************************************ * Retrieve the memory alignment for each tensor ************************************************/ - uint32_t alignmentRequirementA; CHECK_HIPTENSOR_ERROR( hiptensorGetAlignmentRequirement(handle, A_d, &a_ms_ks, &alignmentRequirementA)); @@ -278,11 +287,6 @@ int main(int argc, char* argv[]) bool printElements = false; bool storeElements = false; - if(printElements || storeElements) - { - CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); - } - if(printElements) { if(elementsA < MAX_ELEMENTS_PRINT_COUNT) @@ -305,6 +309,15 @@ int main(int argc, char* argv[]) hiptensorPrintArrayElements(std::cout, C, elementsC); std::cout << std::endl; } + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + + if(elementsC < MAX_ELEMENTS_PRINT_COUNT) + { + std::cout << "Tensor D elements:\n"; + hiptensorPrintArrayElements(std::cout, C, elementsC); + std::cout << std::endl; + } } if(storeElements) @@ -318,6 +331,12 @@ int main(int argc, char* argv[]) hiptensorPrintElementsToFile(tensorB, B, elementsB, ", "); tensorB.close(); + tensorC.open("tensor_C.txt"); + hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); + tensorC.close(); + + CHECK_HIP_ERROR(hipMemcpy(C, C_d, sizeC, hipMemcpyDeviceToHost)); + tensorC.open("tensor_C_scale_contraction_results.txt"); hiptensorPrintElementsToFile(tensorC, C, elementsC, ", "); tensorC.close(); diff --git a/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp new file mode 100644 index 00000000..f6714a2f --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_bf16_bf16_bf16_bf16_compute_bf16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeC = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp new file mode 100644 index 00000000..40708c77 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f16_f16_f16_f16_compute_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeC = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp new file mode 100644 index 00000000..42f60ecb --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_bf16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp new file mode 100644 index 00000000..d39a4fca --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp new file mode 100644 index 00000000..ee046145 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f32_f32_f32_f32_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeC = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample(); +} diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp new file mode 100644 index 00000000..673c4768 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return bilinearContractionSample<ADataType, BDataType, CDataType, floatTypeCompute, typeA, typeB, typeC, typeCompute>(); +}
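Note: every f64 sample gates on isF64Supported() while the f32/f16/bf16 samples gate on isF32Supported(), and all of them funnel into the same templated driver in simple_bilinear_contraction.hpp. The driver's declaration is not shown in this patch; a minimal sketch, reconstructed from the call sites above (parameter names assumed to mirror the sample typedefs), is:

    // Sketch only: the storage types ride as type parameters and the
    // HIP/hiptensor enums as non-type parameters, so each sample .cpp
    // reduces to a single instantiation of this function.
    template <typename ADataType,
              typename BDataType,
              typename CDataType,
              typename floatTypeCompute,
              hipDataType typeA,
              hipDataType typeB,
              hipDataType typeC,
              hiptensorComputeType_t typeCompute>
    int bilinearContractionSample();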
diff --git a/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp new file mode 100644 index 00000000..412ebbc5 --- /dev/null +++ b/samples/01_contraction/simple_bilinear_contraction_f64_f64_f64_f64_compute_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_bilinear_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double CDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeC = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return bilinearContractionSample<ADataType, BDataType, CDataType, floatTypeCompute, typeA, typeB, typeC, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32.cpp b/samples/01_contraction/simple_scale_contraction.hpp similarity index 92% rename from samples/01_contraction/simple_scale_contraction_f32.cpp rename to samples/01_contraction/simple_scale_contraction.hpp index c76ec370..78b026b6 100644 --- a/samples/01_contraction/simple_scale_contraction_f32.cpp +++ b/samples/01_contraction/simple_scale_contraction.hpp @@ -34,29 +34,17 @@ #include "common.hpp" -int main(int argc, char* argv[]) +template <typename ADataType, typename BDataType, typename DDataType, typename floatTypeCompute, hipDataType typeA, hipDataType typeB, hipDataType typeD, hiptensorComputeType_t typeCompute> +int scaleContractionSample() { - /*************************************** - * Check device support * - **************************************/ - if(!isF32Supported()) - { - std::cout << "unsupported host device" << std::endl; - exit(EXIT_FAILURE); - } - - typedef float ADataType; - typedef float BDataType; - typedef float DDataType; - typedef float floatTypeCompute; - - hipDataType typeA = HIP_R_32F; - hipDataType typeB = HIP_R_32F; - hipDataType typeD = HIP_R_32F; - hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; - floatTypeCompute alpha = (floatTypeCompute)1.0f; - /********************** * Computing: C_{m,n,u,v} = A_{m,n,h,k} B_{h,k,u,v} **********************/ @@ -71,12 +59,12 @@ int main(int argc, char* argv[]) std::unordered_map<int32_t, int64_t> extent; - extent['m'] = 5; - extent['n'] = 6; - extent['u'] = 3; - extent['v'] = 4; - extent['h'] = 3; - extent['k'] = 4; + extent['m'] = 4; + extent['n'] = 3; + extent['u'] = 4; + extent['v'] = 3; + extent['h'] = 6; + extent['k'] = 5; std::vector<int64_t> d_ms_ns_lengths; for(auto mode : modeD) @@ -163,14 +151,29 @@ int main(int argc, char* argv[]) /******************* * Initialize data *******************/ + int initMethod = 1; // TODO read the value from command line for(int64_t i = 0; i < elementsA; i++) { - A[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + A[i] = ADataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + A[i] = (ADataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsB; i++) { - B[i] = ((float(std::rand())) / float(RAND_MAX) - 0.5) * 100; + if(initMethod == 0) + { + B[i] = BDataType(float(std::rand()) / float(RAND_MAX) - 0.5) * 100; + } + else + { + B[i] = (BDataType)(float(i) / 100); + } } for(int64_t i = 0; i < elementsD; i++) @@ -270,8 +273,6 @@ int main(int argc, char* argv[]) worksize, 0 /* stream */)); - CHECK_HIP_ERROR(hipMemcpy(D, D_d, sizeD, hipMemcpyDeviceToHost)); - #if !NDEBUG bool printElements = false; bool storeElements = false;
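The initMethod switch above is hardcoded to 1 (a deterministic ramp) with a TODO to read it from the command line; the ramp keeps failures reproducible across runs, which is presumably why it is the default. A hypothetical wiring for that TODO (assuming argc/argv were forwarded into scaleContractionSample, which this patch does not do) could be:

    // Hypothetical sketch for the TODO: a first argument of 0 selects
    // random data, anything else keeps the deterministic ramp.
    int initMethod = 1;
    if(argc > 1)
    {
        initMethod = (std::atoi(argv[1]) == 0) ? 0 : 1;
    }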
diff --git a/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp new file mode 100644 index 00000000..7b0f8b6c --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_bf16_bf16_bf16_compute_bf16.cpp @@ -0,0 +1,51 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + typedef hip_bfloat16 ADataType; + typedef hip_bfloat16 BDataType; + typedef hip_bfloat16 DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16BF; + constexpr hipDataType typeB = HIP_R_16BF; + constexpr hipDataType typeD = HIP_R_16BF; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp new file mode 100644 index 00000000..d69193f0 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f16_f16_f16_compute_f16.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef _Float16 ADataType; + typedef _Float16 BDataType; + typedef _Float16 DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_16F; + constexpr hipDataType typeB = HIP_R_16F; + constexpr hipDataType typeD = HIP_R_16F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp new file mode 100644 index 00000000..c11b8ded --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_bf16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef hip_bfloat16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16BF; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp new file mode 100644 index 00000000..377ee707 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f16.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef _Float16 floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_16F; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp new file mode 100644 index 00000000..e53cc468 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f32_f32_f32_compute_f32.cpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ * + *******************************************************************************/ + +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF32Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef float ADataType; + typedef float BDataType; + typedef float DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_32F; + constexpr hipDataType typeB = HIP_R_32F; + constexpr hipDataType typeD = HIP_R_32F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp new file mode 100644 index 00000000..fdec48ab --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f32.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef float floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_32F; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp new file mode 100644 index 00000000..5eb94c15 --- /dev/null +++ b/samples/01_contraction/simple_scale_contraction_f64_f64_f64_compute_f64.cpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ * + *******************************************************************************/ +#include "simple_scale_contraction.hpp" + +int main(int argc, char* argv[]) +{ + /*************************************** + * Check device support * + **************************************/ + if(!isF64Supported()) + { + std::cout << "unsupported host device" << std::endl; + exit(EXIT_FAILURE); + } + + typedef double ADataType; + typedef double BDataType; + typedef double DDataType; + typedef double floatTypeCompute; + + constexpr hipDataType typeA = HIP_R_64F; + constexpr hipDataType typeB = HIP_R_64F; + constexpr hipDataType typeD = HIP_R_64F; + constexpr hiptensorComputeType_t typeCompute = HIPTENSOR_COMPUTE_64F; + + return scaleContractionSample<ADataType, BDataType, DDataType, floatTypeCompute, typeA, typeB, typeD, typeCompute>(); +} diff --git a/test/01_contraction/configs/bilinear_test_params.yaml b/test/01_contraction/configs/bilinear_test_params.yaml index 2bd90e90..f4be1a88 100644 --- a/test/01_contraction/configs/bilinear_test_params.yaml +++ b/test/01_contraction/configs/bilinear_test_params.yaml @@ -1,12 +1,17 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: - - [ HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F, HIP_R_32F] - - [ HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F, HIP_R_64F] + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -24,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ... diff --git a/test/01_contraction/configs/scale_test_params.yaml b/test/01_contraction/configs/scale_test_params.yaml index 329f1b84..f4be1a88 100644 --- a/test/01_contraction/configs/scale_test_params.yaml +++ b/test/01_contraction/configs/scale_test_params.yaml @@ -1,12 +1,17 @@ --- Log Level: [ HIPTENSOR_LOG_LEVEL_ERROR, HIPTENSOR_LOG_LEVEL_PERF_TRACE ] Tensor Data Types: + - [ HIP_R_16F, HIP_R_16F, NONE_TYPE, HIP_R_16F, HIP_R_32F ] + - [ HIP_R_16BF, HIP_R_16BF, NONE_TYPE, HIP_R_16BF, HIP_R_32F ] - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_32F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16F ] + - [ HIP_R_32F, HIP_R_32F, NONE_TYPE, HIP_R_32F, HIP_R_16BF ] - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_64F ] + - [ HIP_R_64F, HIP_R_64F, NONE_TYPE, HIP_R_64F, HIP_R_32F ] Algorithm Types: - HIPTENSOR_ALGO_DEFAULT - HIPTENSOR_ALGO_DEFAULT_PATIENT - - HIPTENSOR_ALGO_ACTOR_CRITIC + # - HIPTENSOR_ALGO_ACTOR_CRITIC Operators: - HIPTENSOR_OP_IDENTITY Worksize Prefs: @@ -24,7 +29,7 @@ Betas: Lengths: - [ 5, 6, 3, 4, 3, 4 ] - [ 4, 3, 4, 3, 6, 5 ] - - [ 24, 18, 2, 4, 9, 1 ] + - [ 24, 18, 2, 4, 9, 2 ] Strides: - [] ...
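Each Tensor Data Types row reads as [typeA, typeB, typeC, typeD, computeType]: the first four entries are tensor storage types (NONE_TYPE marks the absent C tensor), and the fifth is a hipDataType token that the tests convert to a hiptensorComputeType_t via convertToComputeType. That function is referenced in the test code below; a sketch of the mapping it must perform for these configs (body assumed, only the exercised cases shown):

    // Assumed mapping; only the compute types named in the YAML above.
    hiptensorComputeType_t convertToComputeType(hipDataType hipType)
    {
        switch(hipType)
        {
        case HIP_R_16F:  return HIPTENSOR_COMPUTE_16F;  // _Float16
        case HIP_R_16BF: return HIPTENSOR_COMPUTE_16BF; // hip_bfloat16
        case HIP_R_32F:  return HIPTENSOR_COMPUTE_32F;  // float
        case HIP_R_64F:  return HIPTENSOR_COMPUTE_64F;  // double
        default:         return HIPTENSOR_COMPUTE_32F;  // fallback assumed
        }
    }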
diff --git a/test/01_contraction/contraction_test.cpp b/test/01_contraction/contraction_test.cpp index 5d745d12..76cc3033 100644 --- a/test/01_contraction/contraction_test.cpp +++ b/test/01_contraction/contraction_test.cpp @@ -56,7 +56,8 @@ namespace hiptensor // False = skip test bool ContractionTest::checkDevice(hipDataType datatype) const { - return (isF32Supported() && datatype == HIP_R_32F) + return (isF32Supported() + && (datatype == HIP_R_32F || datatype == HIP_R_16F || datatype == HIP_R_16BF)) || (isF64Supported() && datatype == HIP_R_64F); } @@ -115,11 +116,18 @@ namespace hiptensor auto CDataType = testType[2]; auto DDataType = testType[3]; - EXPECT_TRUE((ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); - EXPECT_TRUE((BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); - EXPECT_TRUE((CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) + EXPECT_TRUE((ADataType == HIP_R_16F) || (ADataType == HIP_R_16BF) + || (ADataType == HIP_R_32F) || (ADataType == HIP_R_64F)); + EXPECT_TRUE((BDataType == HIP_R_16F) || (BDataType == HIP_R_16BF) + || (BDataType == HIP_R_32F) || (BDataType == HIP_R_64F)); + EXPECT_TRUE((CDataType == HIP_R_16F) || (CDataType == HIP_R_16BF) + || (CDataType == HIP_R_32F) || (CDataType == HIP_R_64F) || (CDataType == NONE_TYPE)); - EXPECT_TRUE((DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE((DDataType == HIP_R_16F) || (DDataType == HIP_R_16BF) + || (DDataType == HIP_R_32F) || (DDataType == HIP_R_64F)); + EXPECT_TRUE( + (computeType == HIPTENSOR_COMPUTE_16F) || (computeType == HIPTENSOR_COMPUTE_16BF) + || (computeType == HIPTENSOR_COMPUTE_32F) || (computeType == HIPTENSOR_COMPUTE_64F)); mRunFlag &= checkDevice(DDataType); @@ -228,7 +236,35 @@ namespace hiptensor auto resource = getResource(); resource->resizeStorage(lengths, elementBytes); - if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) + if(ADataType == HIP_R_16F && BDataType == HIP_R_16F && DDataType == HIP_R_16F) + { + // Initialize matrix data on device + fillLaunchKernel<_Float16>((_Float16*)resource->deviceA().get(), elementsA); + fillLaunchKernel<_Float16>((_Float16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16F) + { + fillLaunchKernel<_Float16>((_Float16*)resource->deviceC().get(), elementsCD); + } + fillValLaunchKernel<_Float16>((_Float16*)resource->deviceD().get(), + elementsCD, + std::numeric_limits<_Float16>::signaling_NaN()); + } + else if(ADataType == HIP_R_16BF && BDataType == HIP_R_16BF && DDataType == HIP_R_16BF) + { + // Initialize matrix data on device + fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceA().get(), elementsA); + fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceB().get(), elementsB); + if(CDataType == HIP_R_16BF) + { + fillLaunchKernel<hip_bfloat16>((hip_bfloat16*)resource->deviceC().get(), + elementsCD); + } + fillValLaunchKernel<hip_bfloat16>( + (hip_bfloat16*)resource->deviceD().get(), + elementsCD, + std::numeric_limits<hip_bfloat16>::signaling_NaN()); + } + else if(ADataType == HIP_R_32F && BDataType == HIP_R_32F && DDataType == HIP_R_32F) { // Initialize matrix data on device + fillLaunchKernel<float>((float*)resource->deviceA().get(), elementsA); @@ -328,7 +364,7 @@ namespace hiptensor { auto resource = getResource(); - int size = ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)); + int size = hipDataTypeSize(DDataType);
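hipDataTypeSize replaces the hardcoded f32/f64 ternary so the new f16/bf16 paths size their host and device buffers correctly. For the four types these tests exercise, its behavior amounts to the following (sketch only; the real helper lives in the hiptensor library):

    // Expected per-element sizes for the types used by these tests.
    int hipDataTypeSize(hipDataType t)
    {
        switch(t)
        {
        case HIP_R_16F:  return 2; // _Float16
        case HIP_R_16BF: return 2; // hip_bfloat16
        case HIP_R_32F:  return 4; // float
        case HIP_R_64F:  return 8; // double
        default:         return 0; // types not used here: assumption
        }
    }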
size_t elementsA = std::accumulate(a_ms_ks.mLengths.begin(), a_ms_ks.mLengths.end(), @@ -346,7 +382,50 @@ namespace hiptensor auto D = resource->allocHost(elementsCD * size); resource->copyData(D, resource->deviceD(), elementsCD * size); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements<_Float16>( + stream, (_Float16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements<_Float16>(stream, (_Float16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_16BF) + { + stream << "Tensor A elements:\n"; + hiptensorPrintArrayElements<hip_bfloat16>( + stream, (hip_bfloat16*)resource->hostA().get(), elementsA); + stream << std::endl; + + stream << "Tensor B elements:\n"; + hiptensorPrintArrayElements<hip_bfloat16>( + stream, (hip_bfloat16*)resource->hostB().get(), elementsB); + stream << std::endl; + + stream << "Tensor C elements:\n"; + hiptensorPrintArrayElements<hip_bfloat16>( + stream, (hip_bfloat16*)resource->hostC().get(), elementsCD); + stream << std::endl; + + stream << "Tensor D elements:\n"; + hiptensorPrintArrayElements<hip_bfloat16>( + stream, (hip_bfloat16*)D.get(), elementsCD); + stream << std::endl; + } + else if(DDataType == HIP_R_32F) { stream << "Tensor A elements:\n"; hiptensorPrintArrayElements<float>( @@ -414,6 +493,19 @@ namespace hiptensor auto computeType = convertToComputeType(testType[4]); + /* + * `alpha` and `beta` are void pointers. hiptensor uses readVal to load the value of alpha. + * ``` + * alphaF = hiptensor::readVal<float>( + * alpha, convertToComputeType(HipDataType_v<ComputeDataType>)); + * ``` + * Hence, `alpha` and `beta` need to point to a ComputeData value + */ + double alphaBuf = 0.; + double betaBuf = 0.; + writeVal(&alphaBuf, computeType, alpha); + writeVal(&betaBuf, computeType, beta); + CHECK_HIPTENSOR_ERROR( hiptensorInitContractionPlan(handle, &plan, &desc, &find, worksize)); @@ -421,20 +513,21 @@ CHECK_HIPTENSOR_ERROR(hiptensorContraction(handle, &plan, - (void*)&alpha, + (void*)&alphaBuf, resource->deviceA().get(), resource->deviceB().get(), - (void*)&beta, + (void*)&betaBuf, resource->deviceC().get(), resource->deviceD().get(), workspace, worksize, 0 /* stream */)); - CHECK_HIPTENSOR_ERROR(hiptensorContractionReference((void*)&alpha, + CHECK_HIPTENSOR_ERROR(hiptensorContractionReference(&plan, + (void*)&alphaBuf, resource->hostA().get(), resource->hostB().get(), - (void*)&beta, + (void*)&betaBuf, resource->hostC().get(), resource->hostD().get(), a_ms_ks.mLengths, @@ -456,11 +549,24 @@ namespace hiptensor size_t{1}, std::multiplies<size_t>()); - int sizeD = elementsCD * ((DDataType == HIP_R_32F) ? sizeof(float) : sizeof(double)); + int sizeD = elementsCD * hipDataTypeSize(DDataType);
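The comment block above is the heart of the alpha/beta fix: the library reinterprets the scalar pointers at computeType, so the test first re-encodes its double test values into alphaBuf/betaBuf as that type. A sketch of what writeVal must do (implementation assumed; only the compute types used here are handled):

    // Assumed shape of the helper used above: store a double scalar into dst
    // encoded as the active compute type, so the library's readVal sees the
    // bytes it expects.
    void writeVal(void* dst, hiptensorComputeType_t computeType, double value)
    {
        if(computeType == HIPTENSOR_COMPUTE_16F)
        {
            *static_cast<_Float16*>(dst) = static_cast<_Float16>(value);
        }
        else if(computeType == HIPTENSOR_COMPUTE_16BF)
        {
            *static_cast<hip_bfloat16*>(dst) = hip_bfloat16(static_cast<float>(value));
        }
        else if(computeType == HIPTENSOR_COMPUTE_32F)
        {
            *static_cast<float*>(dst) = static_cast<float>(value);
        }
        else // HIPTENSOR_COMPUTE_64F assumed
        {
            *static_cast<double*>(dst) = value;
        }
    }

A double-sized buffer (the largest case) is big enough for every branch, which is why the test declares alphaBuf and betaBuf as double.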
auto reference = resource->allocDevice(sizeD); resource->copyData(reference, resource->hostD(), sizeD); - if(DDataType == HIP_R_32F) + if(DDataType == HIP_R_16F) + { + std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<_Float16>( + (_Float16*)resource->deviceD().get(), (_Float16*)reference.get(), elementsCD); + } + else if(DDataType == HIP_R_16BF) + { + std::tie(mValidationResult, mMaxRelativeError) + = compareEqualLaunchKernel<hip_bfloat16>( + (hip_bfloat16*)resource->deviceD().get(), + (hip_bfloat16*)reference.get(), + elementsCD); + } + else if(DDataType == HIP_R_32F) { std::tie(mValidationResult, mMaxRelativeError) = compareEqualLaunchKernel<float>( (float*)resource->deviceD().get(), (float*)reference.get(), elementsCD); diff --git a/test/device/common.hpp b/test/device/common.hpp index f961abc1..172e6953 100644 --- a/test/device/common.hpp +++ b/test/device/common.hpp @@ -72,7 +72,7 @@ __global__ void fillKernel(DataType* data, uint32_t elementSize, uint32_t seed) if(index < elementSize) { - auto value = (DataType(index / DataType(RAND_MAX) - 0.5) * 100) / elementSize; + auto value = (DataType(index / double(RAND_MAX) - 0.5) * 100) / elementSize; data[index] = static_cast<DataType>(value); } } diff --git a/test/llvm/yaml_parser_config.cpp b/test/llvm/yaml_parser_config.cpp index 46f4c43e..cd3eb46f 100644 --- a/test/llvm/yaml_parser_config.cpp +++ b/test/llvm/yaml_parser_config.cpp @@ -110,6 +110,7 @@ namespace llvm static void enumeration(IO& io, hipDataType& value) { io.enumCase(value, "HIP_R_16F", HIP_R_16F); + io.enumCase(value, "HIP_R_16BF", HIP_R_16BF); io.enumCase(value, "HIP_R_32F", HIP_R_32F); io.enumCase(value, "HIP_R_64F", HIP_R_64F); io.enumCase(value, "NONE_TYPE", hiptensor::NONE_TYPE); diff --git a/test/utils.hpp b/test/utils.hpp index 1f7ece44..ad4bb565 100644 --- a/test/utils.hpp +++ b/test/utils.hpp @@ -41,9 +41,9 @@ #include #include #include +#include <hiptensor/internal/types.hpp> #include "device/common.hpp" -#include "types.hpp" #define HIPTENSOR_FREE_DEVICE(ptr) \ if(ptr != nullptr) \