From 05d008dd933959dd56d059b0881bacbe237e0488 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 25 Jan 2023 16:44:53 -0500 Subject: [PATCH] Address deprecations in oneAPI 2023.0.0 --- core/src/SYCL/Kokkos_SYCL.cpp | 17 ------- core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp | 28 +++-------- core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp | 9 ++-- core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp | 50 ++++++++----------- core/unit_test/sycl/TestSYCL_InterOp_Init.cpp | 3 +- .../sycl/TestSYCL_InterOp_Init_Context.cpp | 6 +-- .../sycl/TestSYCL_InterOp_Streams.cpp | 3 +- 7 files changed, 36 insertions(+), 80 deletions(-) diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index e38b011c89..8ef4b13012 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -144,7 +144,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, using namespace sycl::info; return os << "Name: " << device.get_info() << "\nDriver Version: " << device.get_info() - << "\nIs Host: " << device.is_host() << "\nIs CPU: " << device.is_cpu() << "\nIs GPU: " << device.is_gpu() << "\nIs Accelerator: " << device.is_accelerator() @@ -184,7 +183,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << "\nNative Vector Width Half: " << device.get_info() << "\nAddress Bits: " << device.get_info() - << "\nImage Support: " << device.get_info() << "\nMax Mem Alloc Size: " << device.get_info() << "\nMax Read Image Args: " @@ -217,26 +215,11 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << "\nLocal Mem Size: " << device.get_info() << "\nError Correction Support: " << device.get_info() - << "\nHost Unified Memory: " - << device.get_info() << "\nProfiling Timer Resolution: " << device.get_info() - << "\nIs Endian Little: " - << device.get_info() << "\nIs Available: " << device.get_info() - << "\nIs Compiler Available: " - << device.get_info() - << "\nIs Linker Available: " - << device.get_info() - << "\nQueue Profiling: " - << device.get_info() << "\nVendor: " << device.get_info() - << "\nProfile: " << device.get_info() << "\nVersion: " << device.get_info() - << "\nPrintf Buffer Size: " - << device.get_info() - << "\nPreferred Interop User Sync: " - << device.get_info() << "\nPartition Max Sub Devices: " << device.get_info() << "\nReference Count: " diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp index c7959c1c1c..5144e57a71 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -293,12 +293,8 @@ class ParallelReduce, ReducerType, instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = - [&](sycl::accessor - local_mem, - sycl::accessor - num_teams_done, + [&](sycl::local_accessor local_mem, + sycl::local_accessor num_teams_done, sycl::device_ptr results_ptr) { const auto begin = policy.begin(); @@ -410,9 +406,7 @@ class ParallelReduce, ReducerType, }; auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - sycl::accessor - num_teams_done(1, cgh); + sycl::local_accessor num_teams_done(1, cgh); auto dummy_reduction_lambda = reduction_lambda_factory({1, cgh}, num_teams_done, nullptr); @@ -453,10 +447,8 @@ class ParallelReduce, ReducerType, wgroup_size - 1) / wgroup_size; - sycl::accessor - local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), - cgh); + sycl::local_accessor local_mem( + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); cgh.depends_on(memcpy_events); @@ -665,13 +657,9 @@ class ParallelReduce, ReducerType, if (size > 1) { auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { - sycl::accessor - local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), - cgh); - sycl::accessor - num_teams_done(1, cgh); + sycl::local_accessor local_mem( + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); + sycl::local_accessor num_teams_done(1, cgh); const BarePolicy bare_policy = m_policy; diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp index cf651ced95..76c73b3452 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -136,11 +136,10 @@ class ParallelScanSYCLBase { q.get_device() .template get_info() .front(); - sycl::accessor - local_mem(sycl::range<1>((wgroup_size + min_subgroup_size - 1) / - min_subgroup_size), - cgh); + sycl::local_accessor local_mem( + sycl::range<1>((wgroup_size + min_subgroup_size - 1) / + min_subgroup_size), + cgh); cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), diff --git a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 601580b2d8..489180361f 100644 --- a/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -398,12 +398,10 @@ class ParallelFor, auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues - sycl::accessor - team_scratch_memory_L0( - sycl::range<1>( - std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), - cgh); + sycl::local_accessor team_scratch_memory_L0( + sycl::range<1>( + std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), + cgh); // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; @@ -432,8 +430,7 @@ class ParallelFor, auto max_sg_size = kernel .get_info( - q.get_device(), - sycl::range<3>(m_team_size, m_vector_size, 1)); + q.get_device()); auto final_vector_size = std::min(m_vector_size, max_sg_size); // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to // be used gives a runtime error. @@ -592,12 +589,10 @@ class ParallelReduce, auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues - sycl::accessor - team_scratch_memory_L0( - sycl::range<1>( - std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), - cgh); + sycl::local_accessor team_scratch_memory_L0( + sycl::range<1>( + std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), + cgh); // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; @@ -645,12 +640,10 @@ class ParallelReduce, // FIXME_SYCL accessors seem to need a size greater than zero at least // for host queues - sycl::accessor - team_scratch_memory_L0( - sycl::range<1>( - std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), - cgh); + sycl::local_accessor team_scratch_memory_L0( + sycl::range<1>( + std::max(m_scratch_size[0] + m_shmem_begin, size_t(1))), + cgh); // Avoid capturing *this since it might not be trivially copyable const auto shmem_begin = m_shmem_begin; @@ -658,9 +651,7 @@ class ParallelReduce, sycl::device_ptr const global_scratch_ptr = m_global_scratch_ptr; auto team_reduction_factory = - [&](sycl::accessor - local_mem, + [&](sycl::local_accessor local_mem, sycl::device_ptr results_ptr) { sycl::global_ptr device_accessible_result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; @@ -793,7 +784,7 @@ class ParallelReduce, }(); auto max_sg_size = kernel.get_info< sycl::info::kernel_device_specific::max_sub_group_size>( - q.get_device(), sycl::range<3>(m_team_size, m_vector_size, 1)); + q.get_device()); auto final_vector_size = std::min(m_vector_size, max_sg_size); // FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to // be used gives a runtime error. @@ -802,12 +793,11 @@ class ParallelReduce, auto wgroup_size = m_team_size * final_vector_size; std::size_t size = std::size_t(m_league_size) * wgroup_size; - sycl::accessor - local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::local_accessor local_mem( + sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + + (sizeof(unsigned int) + sizeof(value_type) - 1) / + sizeof(value_type), + cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); diff --git a/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp index 25c5c9a50c..4b2530316d 100644 --- a/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp +++ b/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp @@ -29,8 +29,7 @@ TEST(sycl, raw_sycl_interop) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); constexpr int n = 100; int* p = sycl::malloc_device(n, queue); { diff --git a/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp index 336a5d59c3..bbd3d2af94 100644 --- a/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp +++ b/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp @@ -27,8 +27,7 @@ TEST(sycl, raw_sycl_interop_context_1) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); constexpr int n = 100; int* p = sycl::malloc_device(n, queue); @@ -61,8 +60,7 @@ TEST(sycl, raw_sycl_interop_context_2) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); constexpr int n = 100; Kokkos::Experimental::SYCL space(queue); diff --git a/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp index 13810d861c..0cfaab8813 100644 --- a/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp +++ b/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp @@ -25,8 +25,7 @@ TEST(sycl, raw_sycl_queues) { Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); - sycl::default_selector device_selector; - sycl::queue queue(default_context, device_selector); + sycl::queue queue(default_context, sycl::default_selector_v); int* p = sycl::malloc_device(100, queue); using MemorySpace = typename TEST_EXECSPACE::memory_space;