diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f19a341001732..cc6d54aaa74ff 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -50,8 +50,9 @@ sycl/plugins/**/hip/ @intel/llvm-reviewers-cuda # CUDA specific runtime implementations sycl/include/sycl/ext/oneapi/experimental/cuda/ @intel/llvm-reviewers-cuda -# CUDA device code tests +# CUDA and HIP device code tests sycl/test/check_device_code/cuda/ @intel/llvm-reviewers-cuda +sycl/test/check_device_code/hip/ @intel/llvm-reviewers-cuda # XPTI instrumentation utilities xpti/ @intel/llvm-reviewers-runtime diff --git a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp index 1b9eb568ad517..faad677343611 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp @@ -482,6 +482,338 @@ struct matrix_params< template using joint_matrix_d = joint_matrix; }; + +////////////////////////////////////////////// +/// AMD Matrix Cores - GFX90A architecture /// +////////////////////////////////////////////// + +template +constexpr bool is_combination_valid_amd_gfx90a(size_t sM, size_t sN, + size_t sK) { + return (std::is_same_v && std::is_same_v && + ((sM == 32 && sN == 32 && sK == 8) || + (sM == 16 && sN == 16 && sK == 16))) || + (std::is_same_v && std::is_same_v && + ((sM == 32 && sN == 32 && sK == 8) || + (sM == 16 && sN == 16 && sK == 16))) || + (std::is_same_v && std::is_same_v && + ((sM == 32 && sN == 32 && sK == 8) || + (sM == 16 && sN == 16 && sK == 16))) || + (std::is_same_v && std::is_same_v && + (sM == 16 && sN == 16 && sK == 4)); +} + +template +constexpr bool are_types_valid_amd_gfx90a() { + return (std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v) || + (std::is_same_v && std::is_same_v); +} + +// Default-values query: +// Specialization for when only types are given, need to query only sizes +template +struct matrix_params< + architecture::amd_gpu_gfx90a, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v && std::is_same_v)>> { + static_assert( + are_types_valid_amd_gfx90a(), + "Invalid types for AMD gfx90a, supported types are half, float, " + "int8_t, int32_t, double and bfloat16 "); + + // Default sizes for AMD gfx90a were chosen to represent a square matrix + static constexpr std::size_t M = 16; + static constexpr std::size_t N = 16; + static constexpr std::size_t K = ((sizeof(Ta) == 8) ? 16 : 4); + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Validation query +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::amd_gpu_gfx90a, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v && std::is_same_v && sM != 0 && + sN != 0 && sK != 0)>> { + static_assert( + is_combination_valid_amd_gfx90a(sM, sN, sK), + "Invalid parameters for AMD gfx90a, query valid combinations " + "using: " + "q.get_device().get_info()"); + + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +///////////////////////////////////////////////// +/// CUDA Tensor Cores - sm70, sm72 and sm80 /// +///////////////////////////////////////////////// + +template +constexpr bool are_types_valid_cuda_sm70() { + return (std::is_same_v && std::is_same_v && + std::is_same_v) || + (std::is_same_v && std::is_same_v && + std::is_same_v) || + (std::is_same_v && std::is_same_v && + std::is_same_v) || + (std::is_same_v && std::is_same_v && + std::is_same_v); +} + +template +constexpr bool are_types_valid_cuda_sm72() { + return (std::is_same_v && std::is_same_v && + std::is_same_v) || + (std::is_same_v && std::is_same_v && + std::is_same_v); +} + +template +constexpr bool are_types_valid_cuda_sm80() { + return (std::is_same_v && std::is_same_v && + std::is_same_v) || + (std::is_same_v && std::is_same_v && + std::is_same_v) || + (std::is_same_v && std::is_same_v && + std::is_same_v); +} + +template +constexpr bool is_combination_valid_cuda_sm70(size_t sM, size_t sN, size_t sK) { + return are_types_valid_cuda_sm70() && + ((sM == 8 && sN == 32 && sK == 16) || + (sM == 16 && sN == 16 && sK == 16) || + (sM == 32 && sN == 8 && sK == 16)); +} + +template +constexpr bool is_combination_valid_cuda_sm72(size_t sM, size_t sN, size_t sK) { + return are_types_valid_cuda_sm72() && + ((sM == 8 && sN == 32 && sK == 16) || + (sM == 16 && sN == 16 && sK == 16) || + (sM == 32 && sN == 8 && sK == 16)); +} + +template +constexpr bool is_combination_valid_cuda_sm80(size_t sM, size_t sN, size_t sK) { + return ((std::is_same_v && std::is_same_v && + std::is_same_v)&&(sM == 16 && sN == 16 && sK == 8)) || + ((std::is_same_v && std::is_same_v && + std::is_same_v)&&((sM == 16 && sN == 16 && sK == 16) || + (sM == 8 && sN == 32 && sK == 16) || + (sM == 32 && sN == 8 && sK == 16))) || + ((std::is_same_v && std::is_same_v && + std::is_same_v)&&(sM == 8 && sN == 8 && sK == 4)); +} + +// Default-values query (nvidia sm70): +// Specialization for when only types are given, need to query only sizes +template +struct matrix_params< + architecture::nvidia_gpu_sm_70, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v)>> { + static_assert( + are_types_valid_cuda_sm70(), + "Invalid types for nvidia sm70, supported types are half and float "); + + // Default sizes for nvidia sm70 were chosen to represent a square matrix + static constexpr std::size_t M = 16; + static constexpr std::size_t N = 16; + static constexpr std::size_t K = 16; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Default-values query (nvidia sm72): +// Specialization for when only types are given, need to query only sizes +template +struct matrix_params< + architecture::nvidia_gpu_sm_72, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v)>::type> { + static_assert( + are_types_valid_cuda_sm70() || + are_types_valid_cuda_sm72(), + "Invalid types for nvidia sm72, supported types are half, float " + "int8_t, uint8_t and int32_t "); + + static constexpr std::size_t M = 16; + static constexpr std::size_t N = 16; + static constexpr std::size_t K = 16; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Default-values query (nvidia sm80): +// Specialization for when only types are given, need to query only sizes +template +struct matrix_params< + architecture::nvidia_gpu_sm_80, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v)>> { + static_assert( + are_types_valid_cuda_sm70() || + are_types_valid_cuda_sm72() || + are_types_valid_cuda_sm80(), + "Invalid types for nvidia sm80, supported types are half, float " + "int8_t, uint8_t, int32_t, double, tf32 and bfloat16 "); + + static constexpr std::size_t M = (sizeof(Ta) == 8) ? 8 : 16; + static constexpr std::size_t N = (sizeof(Ta) == 8) ? 8 : 16; + static constexpr std::size_t K = + std::is_same_v ? 8 : (sizeof(Ta) == 8 ? 4 : 16); + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Validation query (nvidia sm70) +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::nvidia_gpu_sm_70, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v && sM != 0 && sN != 0 && sK != 0)>> { + static_assert( + is_combination_valid_cuda_sm70(sM, sN, sK), + "Invalid parameters for nvidia sm70, query valid combinations " + "using: " + "q.get_device().get_info()"); + + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Validation query (nvidia sm72) +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::nvidia_gpu_sm_72, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v && sM != 0 && sN != 0 && sK != 0)>> { + static_assert( + is_combination_valid_cuda_sm70(sM, sN, sK) || + is_combination_valid_cuda_sm72(sM, sN, sK), + "Invalid parameters for nvidia sm72, query valid combinations " + "using: " + "q.get_device().get_info()"); + + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Validation query (nvidia sm80) +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::nvidia_gpu_sm_80, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if_t<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && !std::is_same_v && + std::is_same_v && sM != 0 && sN != 0 && sK != 0)>> { + static_assert( + is_combination_valid_cuda_sm70(sM, sN, sK) || + is_combination_valid_cuda_sm72(sM, sN, sK) || + is_combination_valid_cuda_sm80(sM, sN, sK), + "Invalid parameters for nvidia sm80, query valid combinations " + "using: " + "q.get_device().get_info()"); + + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + } // namespace experimental::matrix } // namespace oneapi } // namespace ext diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index bd2cc473d24b3..b065799ba7cd8 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -738,6 +738,7 @@ struct get_device_info_impl< get(const DeviceImplPtr &Dev) { using namespace ext::oneapi::experimental::matrix; using namespace ext::oneapi::experimental; + backend CurrentBackend = Dev->getBackend(); architecture DeviceArch = get_device_info_impl< ext::oneapi::experimental::architecture, ext::oneapi::experimental::info::device::architecture>::get(Dev); @@ -807,6 +808,118 @@ struct get_device_info_impl< {8, 0, 0, 0, 8, 16, matrix_type::bf16, matrix_type::bf16, matrix_type::fp32, matrix_type::fp32}, }; + else if (architecture::amd_gpu_gfx90a == DeviceArch) + return { + {0, 0, 0, 32, 32, 8, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 32, 8, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 32, 32, 8, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 4, matrix_type::fp64, matrix_type::fp64, + matrix_type::fp64, matrix_type::fp64}, + }; + else if (backend::ext_oneapi_cuda == CurrentBackend) { + // TODO: Tho following can be simplified when comparison of architectures + // using < and > will be implemented + using oneapi_exp_arch = sycl::ext::oneapi::experimental::architecture; + constexpr std::pair NvidiaArchNumbs[] = { + {5.0, oneapi_exp_arch::nvidia_gpu_sm_50}, + {5.2, oneapi_exp_arch::nvidia_gpu_sm_52}, + {5.3, oneapi_exp_arch::nvidia_gpu_sm_53}, + {6.0, oneapi_exp_arch::nvidia_gpu_sm_60}, + {6.1, oneapi_exp_arch::nvidia_gpu_sm_61}, + {6.2, oneapi_exp_arch::nvidia_gpu_sm_62}, + {7.0, oneapi_exp_arch::nvidia_gpu_sm_70}, + {7.2, oneapi_exp_arch::nvidia_gpu_sm_72}, + {7.5, oneapi_exp_arch::nvidia_gpu_sm_75}, + {8.0, oneapi_exp_arch::nvidia_gpu_sm_80}, + {8.6, oneapi_exp_arch::nvidia_gpu_sm_86}, + {8.7, oneapi_exp_arch::nvidia_gpu_sm_87}, + {8.9, oneapi_exp_arch::nvidia_gpu_sm_89}, + {9.0, oneapi_exp_arch::nvidia_gpu_sm_90}, + }; + auto GetArchNum = [&](const architecture &arch) { + for (const auto &Item : NvidiaArchNumbs) + if (Item.second == arch) + return Item.first; + throw sycl::exception( + make_error_code(errc::runtime), + "The current device architecture is not supported by " + "sycl_ext_oneapi_matrix."); + }; + float ComputeCapability = GetArchNum(DeviceArch); + std::vector sm_70_combinations = { + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}}; + std::vector sm_72_combinations = { + {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 8, 32, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 32, 8, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 16, 16, 16, matrix_type::uint8, matrix_type::uint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 8, 32, 16, matrix_type::uint8, matrix_type::uint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 32, 8, 16, matrix_type::uint8, matrix_type::uint8, + matrix_type::sint32, matrix_type::sint32}}; + std::vector sm_80_combinations = { + {0, 0, 0, 16, 16, 8, matrix_type::tf32, matrix_type::tf32, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 8, 32, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 8, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 8, 8, 4, matrix_type::fp64, matrix_type::fp64, + matrix_type::fp64, matrix_type::fp64}}; + if (ComputeCapability >= 8.0) { + sm_80_combinations.insert(sm_80_combinations.end(), + sm_72_combinations.begin(), + sm_72_combinations.end()); + sm_80_combinations.insert(sm_80_combinations.end(), + sm_70_combinations.begin(), + sm_70_combinations.end()); + return sm_80_combinations; + } else if (ComputeCapability >= 7.2) { + sm_72_combinations.insert(sm_72_combinations.end(), + sm_70_combinations.begin(), + sm_70_combinations.end()); + return sm_72_combinations; + } else if (ComputeCapability >= 7.0) + return sm_70_combinations; + } return {}; } }; diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index b121ed9a28a84..872fa40c16a08 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -2719,11 +2719,7 @@ checkDevSupportDeviceRequirements(const device &Dev, } } - // TODO: remove checks for CUDA and HIP from if-statement below when runtime - // query for them in matrix_combinations is implemented - if (JointMatrixPropIt && - (Dev.get_backend() != sycl::backend::ext_oneapi_cuda) && - (Dev.get_backend() != sycl::backend::ext_oneapi_hip)) { + if (JointMatrixPropIt) { std::vector Combinations = Dev.get_info< ext::oneapi::experimental::info::device::matrix_combinations>(); @@ -2747,11 +2743,7 @@ checkDevSupportDeviceRequirements(const device &Dev, return Result.value(); } - // TODO: remove checks for CUDA and HIP from if-statement below when runtime - // query for them in matrix_combinations is implemented - if (JointMatrixMadPropIt && - (Dev.get_backend() != sycl::backend::ext_oneapi_cuda) && - (Dev.get_backend() != sycl::backend::ext_oneapi_hip)) { + if (JointMatrixMadPropIt) { std::vector Combinations = Dev.get_info< ext::oneapi::experimental::info::device::matrix_combinations>(); diff --git a/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp b/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp new file mode 100644 index 0000000000000..2eef5ee1ef933 --- /dev/null +++ b/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp @@ -0,0 +1,58 @@ +// REQUIRES: gpu-amd-gfx90a +// RUN: %{build} -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx90a -o %t.out +// RUN: %{run} %t.out + +#include + +using namespace sycl::ext::oneapi::experimental::matrix; + +bool find_combination(const combination &comb, + const std::vector &expected_combinations) { + return std::find_if(expected_combinations.begin(), + expected_combinations.end(), + [&comb](const auto &expected_comb) { + return (comb.max_msize == expected_comb.max_msize && + comb.max_nsize == expected_comb.max_nsize && + comb.max_ksize == expected_comb.max_ksize && + comb.msize == expected_comb.msize && + comb.nsize == expected_comb.nsize && + comb.ksize == expected_comb.ksize && + comb.atype == expected_comb.atype && + comb.btype == expected_comb.btype && + comb.ctype == expected_comb.ctype && + comb.dtype == expected_comb.dtype); + }) != expected_combinations.end(); +} + +int main() { + std::vector expected_combinations = { + {0, 0, 0, 32, 32, 8, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 32, 8, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 32, 32, 8, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 4, matrix_type::fp64, matrix_type::fp64, + matrix_type::fp64, matrix_type::fp64}}; + + sycl::queue q; + std::vector actual_combinations = + q.get_device() + .get_info(); + + assert(actual_combinations.size() == expected_combinations.size() && + "Number of combinations is not equal."); + + for (auto &comb : actual_combinations) { + assert(find_combination(comb, expected_combinations) && + "Some values in matrix runtime query for gfx90a are not expected."); + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/runtime_query_tensorcores.cpp b/sycl/test-e2e/Matrix/runtime_query_tensorcores.cpp new file mode 100644 index 0000000000000..e69c512ac400e --- /dev/null +++ b/sycl/test-e2e/Matrix/runtime_query_tensorcores.cpp @@ -0,0 +1,120 @@ +// REQUIRES: cuda +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include + +using namespace sycl::ext::oneapi::experimental::matrix; + +bool find_combination(const combination &comb, + const std::vector &expected_combinations) { + return std::find_if(expected_combinations.begin(), + expected_combinations.end(), + [&comb](const auto &expected_comb) { + return (comb.max_msize == expected_comb.max_msize && + comb.max_nsize == expected_comb.max_nsize && + comb.max_ksize == expected_comb.max_ksize && + comb.msize == expected_comb.msize && + comb.nsize == expected_comb.nsize && + comb.ksize == expected_comb.ksize && + comb.atype == expected_comb.atype && + comb.btype == expected_comb.btype && + comb.ctype == expected_comb.ctype && + comb.dtype == expected_comb.dtype); + }) != expected_combinations.end(); +} + +int main() { + + sycl::queue Q; + auto ComputeCapability = + std::stof(Q.get_device().get_info()); + + std::vector sm_70_combinations = { + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp16}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp32, matrix_type::fp16}, + {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}, + {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16, + matrix_type::fp16, matrix_type::fp32}}; + + std::vector sm_72_combinations = { + {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 8, 32, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 32, 8, 16, matrix_type::sint8, matrix_type::sint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 16, 16, 16, matrix_type::uint8, matrix_type::uint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 8, 32, 16, matrix_type::uint8, matrix_type::uint8, + matrix_type::sint32, matrix_type::sint32}, + {0, 0, 0, 32, 8, 16, matrix_type::uint8, matrix_type::uint8, + matrix_type::sint32, matrix_type::sint32}}; + + std::vector sm_80_combinations = { + {0, 0, 0, 16, 16, 8, matrix_type::tf32, matrix_type::tf32, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 8, 32, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 32, 8, 16, matrix_type::bf16, matrix_type::bf16, + matrix_type::fp32, matrix_type::fp32}, + {0, 0, 0, 8, 8, 4, matrix_type::fp64, matrix_type::fp64, + matrix_type::fp64, matrix_type::fp64}}; + + std::vector expected_combinations; + + if (ComputeCapability >= 8.0) { + std::move(sm_70_combinations.begin(), sm_70_combinations.end(), + std::back_inserter(expected_combinations)); + std::move(sm_72_combinations.begin(), sm_72_combinations.end(), + std::back_inserter(expected_combinations)); + std::move(sm_80_combinations.begin(), sm_80_combinations.end(), + std::back_inserter(expected_combinations)); + } else if (ComputeCapability >= 7.2) { + std::move(sm_70_combinations.begin(), sm_70_combinations.end(), + std::back_inserter(expected_combinations)); + std::move(sm_72_combinations.begin(), sm_72_combinations.end(), + std::back_inserter(expected_combinations)); + } else if (ComputeCapability >= 7.0) { + std::move(sm_70_combinations.begin(), sm_70_combinations.end(), + std::back_inserter(expected_combinations)); + } else { + return 0; + } + + std::vector actual_combinations = + Q.get_device() + .get_info(); + + assert(actual_combinations.size() == expected_combinations.size() && + "Number of combinations is not equal."); + + for (auto &comb : actual_combinations) { + assert(find_combination(comb, expected_combinations) && + "Some values in matrix runtime query for CUDA are not expected."); + } + + return 0; +} diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-compile-query-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-compile-query-test.cpp new file mode 100644 index 0000000000000..56dfbf173fafc --- /dev/null +++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-compile-query-test.cpp @@ -0,0 +1,33 @@ +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s -o compile-query-cuda + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental; +using namespace sycl::ext::oneapi::experimental::matrix; + +int main() { + // Compile-time query to validate the matrix parameters + using myparams = matrix_params; + + size_t dmsize = myparams::M; + size_t dnsize = myparams::N; + size_t dksize = myparams::K; + std::cout + << "sizes of Nvidia gpu sm70 matrix_params chosen by the user are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + + // Sizes-only compile-time query: types are given, generate default sizes + using myparams2 = + matrix_params; + myparams2 p; + dmsize = myparams2::M; + dnsize = myparams2::N; + dksize = myparams2::K; + std::cout << "default Nvidia gpu sm70 sizes matrix_params are: M " << dmsize + << " N " << dnsize << " K " << dksize << std::endl; + return 0; +} diff --git a/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp b/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp new file mode 100644 index 0000000000000..4337cff1fda2b --- /dev/null +++ b/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp @@ -0,0 +1,33 @@ +// REQUIRES: hip +// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o compile-query-hip + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental; +using namespace sycl::ext::oneapi::experimental::matrix; + +int main() { + // Compile-time query to validate the matrix parameters + using myparams = matrix_params; + + size_t dmsize = myparams::M; + size_t dnsize = myparams::N; + size_t dksize = myparams::K; + std::cout + << "sizes of AMD gpu gfx90a matrix_params chosen by the user are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + + // Sizes-only compile-time query: types are given, generate default sizes + using myparams2 = matrix_params; + myparams2 p; + dmsize = myparams2::M; + dnsize = myparams2::N; + dksize = myparams2::K; + std::cout << "default AMD gpu gfx90a sizes matrix_params are: M " << dmsize + << " N " << dnsize << " K " << dksize << std::endl; + return 0; +};