diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f19a341001732..cc6d54aaa74ff 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -50,8 +50,9 @@ sycl/plugins/**/hip/ @intel/llvm-reviewers-cuda
 # CUDA specific runtime implementations
 sycl/include/sycl/ext/oneapi/experimental/cuda/ @intel/llvm-reviewers-cuda
 
-# CUDA device code tests
+# CUDA and HIP device code tests
 sycl/test/check_device_code/cuda/ @intel/llvm-reviewers-cuda
+sycl/test/check_device_code/hip/ @intel/llvm-reviewers-cuda
 
 # XPTI instrumentation utilities
 xpti/ @intel/llvm-reviewers-runtime
diff --git a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp
index 1b9eb568ad517..faad677343611 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp
@@ -482,6 +482,338 @@ struct matrix_params<
   template <typename Group>
   using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
 };
+
+//////////////////////////////////////////////
+/// AMD Matrix Cores - GFX90A architecture ///
+//////////////////////////////////////////////
+
+template <typename Ta, typename Tc>
+constexpr bool is_combination_valid_amd_gfx90a(size_t sM, size_t sN,
+                                               size_t sK) {
+  return (std::is_same_v<Ta, half> && std::is_same_v<Tc, float> &&
+          ((sM == 32 && sN == 32 && sK == 8) ||
+           (sM == 16 && sN == 16 && sK == 16))) ||
+         (std::is_same_v<Ta, int8_t> && std::is_same_v<Tc, int32_t> &&
+          ((sM == 32 && sN == 32 && sK == 8) ||
+           (sM == 16 && sN == 16 && sK == 16))) ||
+         (std::is_same_v<Ta, bfloat16> && std::is_same_v<Tc, float> &&
+          ((sM == 32 && sN == 32 && sK == 8) ||
+           (sM == 16 && sN == 16 && sK == 16))) ||
+         (std::is_same_v<Ta, double> && std::is_same_v<Tc, double> &&
+          (sM == 16 && sN == 16 && sK == 4));
+}
+
+template <typename Ta, typename Tc>
+constexpr bool are_types_valid_amd_gfx90a() {
+  return (std::is_same_v<Ta, half> && std::is_same_v<Tc, float>) ||
+         (std::is_same_v<Ta, int8_t> && std::is_same_v<Tc, int32_t>) ||
+         (std::is_same_v<Ta, bfloat16> && std::is_same_v<Tc, float>) ||
+         (std::is_same_v<Ta, double> && std::is_same_v<Tc, double>);
+}
+
+// Default-values query:
+// Specialization for when only types are given, need to query only sizes
+template <typename Ta, typename Tb, typename Tc, typename Td>
+struct matrix_params<
+    architecture::amd_gpu_gfx90a, Ta, Tb, Tc, Td, 0, 0, 0,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb> && std::is_same_v<Tc, Td>)>> {
+  static_assert(
+      are_types_valid_amd_gfx90a<Ta, Tc>(),
+      "Invalid types for AMD gfx90a, supported types are half, float, "
+      "int8_t, int32_t, double and bfloat16 ");
+
+  // Default sizes for AMD gfx90a were chosen to represent a square matrix
+  static constexpr std::size_t M = 16;
+  static constexpr std::size_t N = 16;
+  static constexpr std::size_t K = ((sizeof(Ta) == 8) ? 16 : 4);
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+// Validation query
+// Specialization when both types and sizes are given
+template <typename Ta, typename Tb, typename Tc, typename Td, size_t sM,
+          size_t sN, size_t sK>
+struct matrix_params<
+    architecture::amd_gpu_gfx90a, Ta, Tb, Tc, Td, sM, sN, sK,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb> && std::is_same_v<Tc, Td> && sM != 0 &&
+        sN != 0 && sK != 0)>> {
+  static_assert(
+      is_combination_valid_amd_gfx90a<Ta, Tc>(sM, sN, sK),
+      "Invalid parameters for AMD gfx90a, query valid combinations "
+      "using: "
+      "q.get_device().get_info<sycl::info::device::matrix::combinations>()");
+
+  static constexpr std::size_t M = sM;
+  static constexpr std::size_t N = sN;
+  static constexpr std::size_t K = sK;
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+/////////////////////////////////////////////////
+/// CUDA Tensor Cores - sm70, sm72 and sm80 ///
+/////////////////////////////////////////////////
+
+template <typename Ta, typename Tc, typename Td>
+constexpr bool are_types_valid_cuda_sm70() {
+  return (std::is_same_v<Ta, half> && std::is_same_v<Tc, float> &&
+          std::is_same_v<Td, float>) ||
+         (std::is_same_v<Ta, half> && std::is_same_v<Tc, half> &&
+          std::is_same_v<Td, half>) ||
+         (std::is_same_v<Ta, half> && std::is_same_v<Tc, float> &&
+          std::is_same_v<Td, half>) ||
+         (std::is_same_v<Ta, half> && std::is_same_v<Tc, half> &&
+          std::is_same_v<Td, float>);
+}
+
+template <typename Ta, typename Tc, typename Td>
+constexpr bool are_types_valid_cuda_sm72() {
+  return (std::is_same_v<Ta, int8_t> && std::is_same_v<Tc, int32_t> &&
+          std::is_same_v<Td, int32_t>) ||
+         (std::is_same_v<Ta, uint8_t> && std::is_same_v<Tc, int32_t> &&
+          std::is_same_v<Td, int32_t>);
+}
+
+template <typename Ta, typename Tc, typename Td>
+constexpr bool are_types_valid_cuda_sm80() {
+  return (std::is_same_v<Ta, precision::tf32> && std::is_same_v<Tc, float> &&
+          std::is_same_v<Td, float>) ||
+         (std::is_same_v<Ta, bfloat16> && std::is_same_v<Tc, float> &&
+          std::is_same_v<Td, float>) ||
+         (std::is_same_v<Ta, double> && std::is_same_v<Tc, double> &&
+          std::is_same_v<Td, double>);
+}
+
+template <typename Ta, typename Tc, typename Td>
+constexpr bool is_combination_valid_cuda_sm70(size_t sM, size_t sN, size_t sK) {
+  return are_types_valid_cuda_sm70<Ta, Tc, Td>() &&
+         ((sM == 8 && sN == 32 && sK == 16) ||
+          (sM == 16 && sN == 16 && sK == 16) ||
+          (sM == 32 && sN == 8 && sK == 16));
+}
+
+template <typename Ta, typename Tc, typename Td>
+constexpr bool is_combination_valid_cuda_sm72(size_t sM, size_t sN, size_t sK) {
+  return are_types_valid_cuda_sm72<Ta, Tc, Td>() &&
+         ((sM == 8 && sN == 32 && sK == 16) ||
+          (sM == 16 && sN == 16 && sK == 16) ||
+          (sM == 32 && sN == 8 && sK == 16));
+}
+
+template <typename Ta, typename Tc, typename Td>
+constexpr bool is_combination_valid_cuda_sm80(size_t sM, size_t sN, size_t sK) {
+  return ((std::is_same_v<Ta, precision::tf32> && std::is_same_v<Tc, float> &&
+           std::is_same_v<Td, float>)&&(sM == 16 && sN == 16 && sK == 8)) ||
+         ((std::is_same_v<Ta, bfloat16> && std::is_same_v<Tc, float> &&
+           std::is_same_v<Td, float>)&&((sM == 16 && sN == 16 && sK == 16) ||
+                                        (sM == 8 && sN == 32 && sK == 16) ||
+                                        (sM == 32 && sN == 8 && sK == 16))) ||
+         ((std::is_same_v<Ta, double> && std::is_same_v<Tc, double> &&
+           std::is_same_v<Td, double>)&&(sM == 8 && sN == 8 && sK == 4));
+}
+
+// Default-values query (nvidia sm70):
+// Specialization for when only types are given, need to query only sizes
+template <typename Ta, typename Tb, typename Tc, typename Td>
+struct matrix_params<
+    architecture::nvidia_gpu_sm_70, Ta, Tb, Tc, Td, 0, 0, 0,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb>)>> {
+  static_assert(
+      are_types_valid_cuda_sm70<Ta, Tc, Td>(),
+      "Invalid types for nvidia sm70, supported types are half and float ");
+
+  // Default sizes for nvidia sm70 were chosen to represent a square matrix
+  static constexpr std::size_t M = 16;
+  static constexpr std::size_t N = 16;
+  static constexpr std::size_t K = 16;
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+// Default-values query (nvidia sm72):
+// Specialization for when only types are given, need to query only sizes
+template <typename Ta, typename Tb, typename Tc, typename Td>
+struct matrix_params<
+    architecture::nvidia_gpu_sm_72, Ta, Tb, Tc, Td, 0, 0, 0,
+    typename std::enable_if<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb>)>::type> {
+  static_assert(
+      are_types_valid_cuda_sm70<Ta, Tc, Td>() ||
+          are_types_valid_cuda_sm72<Ta, Tc, Td>(),
+      "Invalid types for nvidia sm72, supported types are half, float "
+      "int8_t, uint8_t and int32_t ");
+
+  static constexpr std::size_t M = 16;
+  static constexpr std::size_t N = 16;
+  static constexpr std::size_t K = 16;
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+// Default-values query (nvidia sm80):
+// Specialization for when only types are given, need to query only sizes
+template <typename Ta, typename Tb, typename Tc, typename Td>
+struct matrix_params<
+    architecture::nvidia_gpu_sm_80, Ta, Tb, Tc, Td, 0, 0, 0,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb>)>> {
+  static_assert(
+      are_types_valid_cuda_sm70<Ta, Tc, Td>() ||
+          are_types_valid_cuda_sm72<Ta, Tc, Td>() ||
+          are_types_valid_cuda_sm80<Ta, Tc, Td>(),
+      "Invalid types for nvidia sm80, supported types are half, float "
+      "int8_t, uint8_t, int32_t, double, tf32 and bfloat16 ");
+
+  static constexpr std::size_t M = (sizeof(Ta) == 8) ? 8 : 16;
+  static constexpr std::size_t N = (sizeof(Ta) == 8) ? 8 : 16;
+  static constexpr std::size_t K =
+      std::is_same_v<Ta, precision::tf32> ? 8 : (sizeof(Ta) == 8 ? 4 : 16);
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+// Validation query (nvidia sm70)
+// Specialization when both types and sizes are given
+template <typename Ta, typename Tb, typename Tc, typename Td, size_t sM,
+          size_t sN, size_t sK>
+struct matrix_params<
+    architecture::nvidia_gpu_sm_70, Ta, Tb, Tc, Td, sM, sN, sK,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb> && sM != 0 && sN != 0 && sK != 0)>> {
+  static_assert(
+      is_combination_valid_cuda_sm70<Ta, Tc, Td>(sM, sN, sK),
+      "Invalid parameters for nvidia sm70, query valid combinations "
+      "using: "
+      "q.get_device().get_info<sycl::info::device::matrix::combinations>()");
+
+  static constexpr std::size_t M = sM;
+  static constexpr std::size_t N = sN;
+  static constexpr std::size_t K = sK;
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+// Validation query (nvidia sm72)
+// Specialization when both types and sizes are given
+template <typename Ta, typename Tb, typename Tc, typename Td, size_t sM,
+          size_t sN, size_t sK>
+struct matrix_params<
+    architecture::nvidia_gpu_sm_72, Ta, Tb, Tc, Td, sM, sN, sK,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb> && sM != 0 && sN != 0 && sK != 0)>> {
+  static_assert(
+      is_combination_valid_cuda_sm70<Ta, Tc, Td>(sM, sN, sK) ||
+          is_combination_valid_cuda_sm72<Ta, Tc, Td>(sM, sN, sK),
+      "Invalid parameters for nvidia sm72, query valid combinations "
+      "using: "
+      "q.get_device().get_info<sycl::info::device::matrix::combinations>()");
+
+  static constexpr std::size_t M = sM;
+  static constexpr std::size_t N = sN;
+  static constexpr std::size_t K = sK;
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
+// Validation query (nvidia sm80)
+// Specialization when both types and sizes are given
+template <typename Ta, typename Tb, typename Tc, typename Td, size_t sM,
+          size_t sN, size_t sK>
+struct matrix_params<
+    architecture::nvidia_gpu_sm_80, Ta, Tb, Tc, Td, sM, sN, sK,
+    typename std::enable_if_t<(
+        !std::is_same_v<Ta, void> && !std::is_same_v<Tb, void> &&
+        !std::is_same_v<Tc, void> && !std::is_same_v<Td, void> &&
+        std::is_same_v<Ta, Tb> && sM != 0 && sN != 0 && sK != 0)>> {
+  static_assert(
+      is_combination_valid_cuda_sm70<Ta, Tc, Td>(sM, sN, sK) ||
+          is_combination_valid_cuda_sm72<Ta, Tc, Td>(sM, sN, sK) ||
+          is_combination_valid_cuda_sm80<Ta, Tc, Td>(sM, sN, sK),
+      "Invalid parameters for nvidia sm80, query valid combinations "
+      "using: "
+      "q.get_device().get_info<sycl::info::device::matrix::combinations>()");
+
+  static constexpr std::size_t M = sM;
+  static constexpr std::size_t N = sN;
+  static constexpr std::size_t K = sK;
+
+  template <typename Group, layout Layout>
+  using joint_matrix_a = joint_matrix<Group, Ta, use::a, M, K, Layout>;
+  template <typename Group, layout Layout>
+  using joint_matrix_b = joint_matrix<Group, Tb, use::b, K, N, Layout>;
+  template <typename Group>
+  using joint_matrix_c = joint_matrix<Group, Tc, use::accumulator, M, N>;
+  template <typename Group>
+  using joint_matrix_d = joint_matrix<Group, Td, use::accumulator, M, N>;
+};
+
 } // namespace experimental::matrix
 } // namespace oneapi
 } // namespace ext
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index bd2cc473d24b3..b065799ba7cd8 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -738,6 +738,7 @@ struct get_device_info_impl<
   get(const DeviceImplPtr &Dev) {
     using namespace ext::oneapi::experimental::matrix;
     using namespace ext::oneapi::experimental;
+    backend CurrentBackend = Dev->getBackend();
     architecture DeviceArch = get_device_info_impl<
         ext::oneapi::experimental::architecture,
         ext::oneapi::experimental::info::device::architecture>::get(Dev);
@@ -807,6 +808,118 @@ struct get_device_info_impl<
           {8, 0, 0, 0, 8, 16, matrix_type::bf16, matrix_type::bf16,
            matrix_type::fp32, matrix_type::fp32},
       };
+    else if (architecture::amd_gpu_gfx90a == DeviceArch)
+      return {
+          {0, 0, 0, 32, 32, 8, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 32, 32, 8, matrix_type::sint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 32, 32, 8, matrix_type::bf16, matrix_type::bf16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 16, 16, 4, matrix_type::fp64, matrix_type::fp64,
+           matrix_type::fp64, matrix_type::fp64},
+      };
+    else if (backend::ext_oneapi_cuda == CurrentBackend) {
+      // TODO: Tho following can be simplified when comparison of architectures
+      // using < and > will be implemented
+      using oneapi_exp_arch = sycl::ext::oneapi::experimental::architecture;
+      constexpr std::pair<float, oneapi_exp_arch> NvidiaArchNumbs[] = {
+          {5.0, oneapi_exp_arch::nvidia_gpu_sm_50},
+          {5.2, oneapi_exp_arch::nvidia_gpu_sm_52},
+          {5.3, oneapi_exp_arch::nvidia_gpu_sm_53},
+          {6.0, oneapi_exp_arch::nvidia_gpu_sm_60},
+          {6.1, oneapi_exp_arch::nvidia_gpu_sm_61},
+          {6.2, oneapi_exp_arch::nvidia_gpu_sm_62},
+          {7.0, oneapi_exp_arch::nvidia_gpu_sm_70},
+          {7.2, oneapi_exp_arch::nvidia_gpu_sm_72},
+          {7.5, oneapi_exp_arch::nvidia_gpu_sm_75},
+          {8.0, oneapi_exp_arch::nvidia_gpu_sm_80},
+          {8.6, oneapi_exp_arch::nvidia_gpu_sm_86},
+          {8.7, oneapi_exp_arch::nvidia_gpu_sm_87},
+          {8.9, oneapi_exp_arch::nvidia_gpu_sm_89},
+          {9.0, oneapi_exp_arch::nvidia_gpu_sm_90},
+      };
+      auto GetArchNum = [&](const architecture &arch) {
+        for (const auto &Item : NvidiaArchNumbs)
+          if (Item.second == arch)
+            return Item.first;
+        throw sycl::exception(
+            make_error_code(errc::runtime),
+            "The current device architecture is not supported by "
+            "sycl_ext_oneapi_matrix.");
+      };
+      float ComputeCapability = GetArchNum(DeviceArch);
+      std::vector<combination> sm_70_combinations = {
+          {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp16, matrix_type::fp16},
+          {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp16, matrix_type::fp16},
+          {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp16, matrix_type::fp16},
+          {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp16},
+          {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp16},
+          {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp16},
+          {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp16, matrix_type::fp32},
+          {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp16, matrix_type::fp32},
+          {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp16, matrix_type::fp32}};
+      std::vector<combination> sm_72_combinations = {
+          {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 8, 32, 16, matrix_type::sint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 32, 8, 16, matrix_type::sint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 16, 16, 16, matrix_type::uint8, matrix_type::uint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 8, 32, 16, matrix_type::uint8, matrix_type::uint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {0, 0, 0, 32, 8, 16, matrix_type::uint8, matrix_type::uint8,
+           matrix_type::sint32, matrix_type::sint32}};
+      std::vector<combination> sm_80_combinations = {
+          {0, 0, 0, 16, 16, 8, matrix_type::tf32, matrix_type::tf32,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 8, 32, 16, matrix_type::bf16, matrix_type::bf16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 32, 8, 16, matrix_type::bf16, matrix_type::bf16,
+           matrix_type::fp32, matrix_type::fp32},
+          {0, 0, 0, 8, 8, 4, matrix_type::fp64, matrix_type::fp64,
+           matrix_type::fp64, matrix_type::fp64}};
+      if (ComputeCapability >= 8.0) {
+        sm_80_combinations.insert(sm_80_combinations.end(),
+                                  sm_72_combinations.begin(),
+                                  sm_72_combinations.end());
+        sm_80_combinations.insert(sm_80_combinations.end(),
+                                  sm_70_combinations.begin(),
+                                  sm_70_combinations.end());
+        return sm_80_combinations;
+      } else if (ComputeCapability >= 7.2) {
+        sm_72_combinations.insert(sm_72_combinations.end(),
+                                  sm_70_combinations.begin(),
+                                  sm_70_combinations.end());
+        return sm_72_combinations;
+      } else if (ComputeCapability >= 7.0)
+        return sm_70_combinations;
+    }
     return {};
   }
 };
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index b121ed9a28a84..872fa40c16a08 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -2719,11 +2719,7 @@ checkDevSupportDeviceRequirements(const device &Dev,
     }
   }
 
-  // TODO: remove checks for CUDA and HIP from if-statement below when runtime
-  // query for them in matrix_combinations is implemented
-  if (JointMatrixPropIt &&
-      (Dev.get_backend() != sycl::backend::ext_oneapi_cuda) &&
-      (Dev.get_backend() != sycl::backend::ext_oneapi_hip)) {
+  if (JointMatrixPropIt) {
     std::vector<ext::oneapi::experimental::matrix::combination> Combinations =
         Dev.get_info<
             ext::oneapi::experimental::info::device::matrix_combinations>();
@@ -2747,11 +2743,7 @@ checkDevSupportDeviceRequirements(const device &Dev,
       return Result.value();
   }
 
-  // TODO: remove checks for CUDA and HIP from if-statement below when runtime
-  // query for them in matrix_combinations is implemented
-  if (JointMatrixMadPropIt &&
-      (Dev.get_backend() != sycl::backend::ext_oneapi_cuda) &&
-      (Dev.get_backend() != sycl::backend::ext_oneapi_hip)) {
+  if (JointMatrixMadPropIt) {
     std::vector<ext::oneapi::experimental::matrix::combination> Combinations =
         Dev.get_info<
             ext::oneapi::experimental::info::device::matrix_combinations>();
diff --git a/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp b/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp
new file mode 100644
index 0000000000000..2eef5ee1ef933
--- /dev/null
+++ b/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp
@@ -0,0 +1,58 @@
+// REQUIRES: gpu-amd-gfx90a
+// RUN: %{build} -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx90a -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+bool find_combination(const combination &comb,
+                      const std::vector<combination> &expected_combinations) {
+  return std::find_if(expected_combinations.begin(),
+                      expected_combinations.end(),
+                      [&comb](const auto &expected_comb) {
+                        return (comb.max_msize == expected_comb.max_msize &&
+                                comb.max_nsize == expected_comb.max_nsize &&
+                                comb.max_ksize == expected_comb.max_ksize &&
+                                comb.msize == expected_comb.msize &&
+                                comb.nsize == expected_comb.nsize &&
+                                comb.ksize == expected_comb.ksize &&
+                                comb.atype == expected_comb.atype &&
+                                comb.btype == expected_comb.btype &&
+                                comb.ctype == expected_comb.ctype &&
+                                comb.dtype == expected_comb.dtype);
+                      }) != expected_combinations.end();
+}
+
+int main() {
+  std::vector<combination> expected_combinations = {
+      {0, 0, 0, 32, 32, 8, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 32, 32, 8, matrix_type::sint8, matrix_type::sint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 32, 32, 8, matrix_type::bf16, matrix_type::bf16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 16, 16, 4, matrix_type::fp64, matrix_type::fp64,
+       matrix_type::fp64, matrix_type::fp64}};
+
+  sycl::queue q;
+  std::vector<combination> actual_combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  assert(actual_combinations.size() == expected_combinations.size() &&
+         "Number of combinations is not equal.");
+
+  for (auto &comb : actual_combinations) {
+    assert(find_combination(comb, expected_combinations) &&
+           "Some values in matrix runtime query for gfx90a are not expected.");
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/runtime_query_tensorcores.cpp b/sycl/test-e2e/Matrix/runtime_query_tensorcores.cpp
new file mode 100644
index 0000000000000..e69c512ac400e
--- /dev/null
+++ b/sycl/test-e2e/Matrix/runtime_query_tensorcores.cpp
@@ -0,0 +1,120 @@
+// REQUIRES: cuda
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+bool find_combination(const combination &comb,
+                      const std::vector<combination> &expected_combinations) {
+  return std::find_if(expected_combinations.begin(),
+                      expected_combinations.end(),
+                      [&comb](const auto &expected_comb) {
+                        return (comb.max_msize == expected_comb.max_msize &&
+                                comb.max_nsize == expected_comb.max_nsize &&
+                                comb.max_ksize == expected_comb.max_ksize &&
+                                comb.msize == expected_comb.msize &&
+                                comb.nsize == expected_comb.nsize &&
+                                comb.ksize == expected_comb.ksize &&
+                                comb.atype == expected_comb.atype &&
+                                comb.btype == expected_comb.btype &&
+                                comb.ctype == expected_comb.ctype &&
+                                comb.dtype == expected_comb.dtype);
+                      }) != expected_combinations.end();
+}
+
+int main() {
+
+  sycl::queue Q;
+  auto ComputeCapability =
+      std::stof(Q.get_device().get_info<sycl::info::device::backend_version>());
+
+  std::vector<combination> sm_70_combinations = {
+      {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp16, matrix_type::fp16},
+      {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp16, matrix_type::fp16},
+      {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp16, matrix_type::fp16},
+      {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp16},
+      {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp16},
+      {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp32, matrix_type::fp16},
+      {0, 0, 0, 16, 16, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp16, matrix_type::fp32},
+      {0, 0, 0, 8, 32, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp16, matrix_type::fp32},
+      {0, 0, 0, 32, 8, 16, matrix_type::fp16, matrix_type::fp16,
+       matrix_type::fp16, matrix_type::fp32}};
+
+  std::vector<combination> sm_72_combinations = {
+      {0, 0, 0, 16, 16, 16, matrix_type::sint8, matrix_type::sint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 8, 32, 16, matrix_type::sint8, matrix_type::sint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 32, 8, 16, matrix_type::sint8, matrix_type::sint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 16, 16, 16, matrix_type::uint8, matrix_type::uint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 8, 32, 16, matrix_type::uint8, matrix_type::uint8,
+       matrix_type::sint32, matrix_type::sint32},
+      {0, 0, 0, 32, 8, 16, matrix_type::uint8, matrix_type::uint8,
+       matrix_type::sint32, matrix_type::sint32}};
+
+  std::vector<combination> sm_80_combinations = {
+      {0, 0, 0, 16, 16, 8, matrix_type::tf32, matrix_type::tf32,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 16, 16, 16, matrix_type::bf16, matrix_type::bf16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 8, 32, 16, matrix_type::bf16, matrix_type::bf16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 32, 8, 16, matrix_type::bf16, matrix_type::bf16,
+       matrix_type::fp32, matrix_type::fp32},
+      {0, 0, 0, 8, 8, 4, matrix_type::fp64, matrix_type::fp64,
+       matrix_type::fp64, matrix_type::fp64}};
+
+  std::vector<combination> expected_combinations;
+
+  if (ComputeCapability >= 8.0) {
+    std::move(sm_70_combinations.begin(), sm_70_combinations.end(),
+              std::back_inserter(expected_combinations));
+    std::move(sm_72_combinations.begin(), sm_72_combinations.end(),
+              std::back_inserter(expected_combinations));
+    std::move(sm_80_combinations.begin(), sm_80_combinations.end(),
+              std::back_inserter(expected_combinations));
+  } else if (ComputeCapability >= 7.2) {
+    std::move(sm_70_combinations.begin(), sm_70_combinations.end(),
+              std::back_inserter(expected_combinations));
+    std::move(sm_72_combinations.begin(), sm_72_combinations.end(),
+              std::back_inserter(expected_combinations));
+  } else if (ComputeCapability >= 7.0) {
+    std::move(sm_70_combinations.begin(), sm_70_combinations.end(),
+              std::back_inserter(expected_combinations));
+  } else {
+    return 0;
+  }
+
+  std::vector<combination> actual_combinations =
+      Q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  assert(actual_combinations.size() == expected_combinations.size() &&
+         "Number of combinations is not equal.");
+
+  for (auto &comb : actual_combinations) {
+    assert(find_combination(comb, expected_combinations) &&
+           "Some values in matrix runtime query for CUDA are not expected.");
+  }
+
+  return 0;
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-compile-query-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-compile-query-test.cpp
new file mode 100644
index 0000000000000..56dfbf173fafc
--- /dev/null
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-compile-query-test.cpp
@@ -0,0 +1,33 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s -o compile-query-cuda
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+int main() {
+  // Compile-time query to validate the matrix parameters
+  using myparams = matrix_params<architecture::nvidia_gpu_sm_70, half, half,
+                                 float, float, 32, 8, 16>;
+
+  size_t dmsize = myparams::M;
+  size_t dnsize = myparams::N;
+  size_t dksize = myparams::K;
+  std::cout
+      << "sizes of Nvidia gpu sm70 matrix_params chosen by the user are: M "
+      << dmsize << " N " << dnsize << " K " << dksize << std::endl;
+
+  // Sizes-only compile-time query: types are given, generate default sizes
+  using myparams2 =
+      matrix_params<architecture::nvidia_gpu_sm_70, half, half, float, float>;
+  myparams2 p;
+  dmsize = myparams2::M;
+  dnsize = myparams2::N;
+  dksize = myparams2::K;
+  std::cout << "default Nvidia gpu sm70 sizes matrix_params are: M " << dmsize
+            << " N " << dnsize << " K " << dksize << std::endl;
+  return 0;
+}
diff --git a/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp b/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp
new file mode 100644
index 0000000000000..4337cff1fda2b
--- /dev/null
+++ b/sycl/test/check_device_code/hip/matrix/compile-query-hip-gfx90a.cpp
@@ -0,0 +1,33 @@
+// REQUIRES: hip
+// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o compile-query-hip
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+int main() {
+  // Compile-time query to validate the matrix parameters
+  using myparams = matrix_params<architecture::amd_gpu_gfx90a, int8_t, int8_t,
+                                 int32_t, int32_t, 32, 32, 8>;
+
+  size_t dmsize = myparams::M;
+  size_t dnsize = myparams::N;
+  size_t dksize = myparams::K;
+  std::cout
+      << "sizes of AMD gpu gfx90a matrix_params chosen by the user are: M "
+      << dmsize << " N " << dnsize << " K " << dksize << std::endl;
+
+  // Sizes-only compile-time query: types are given, generate default sizes
+  using myparams2 = matrix_params<architecture::amd_gpu_gfx90a, int8_t, int8_t,
+                                  int32_t, int32_t>;
+  myparams2 p;
+  dmsize = myparams2::M;
+  dnsize = myparams2::N;
+  dksize = myparams2::K;
+  std::cout << "default AMD gpu gfx90a sizes matrix_params  are: M " << dmsize
+            << " N " << dnsize << " K " << dksize << std::endl;
+  return 0;
+};