From 6cc026b97ca5ba0396b3a6e06706840260a90f3a Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 25 Mar 2021 11:11:36 -0700 Subject: [PATCH] [SYCL] Runtime property lists on kernel invocation functions Add an optional parameter to kernel invocation functions which accepts property list. Such properties are going to affect the way the kernel is invoked. Example is a property to set cache/SLM size for a kernel invocation which is going to be implemented in upcoming PR. --- sycl/CMakeLists.txt | 2 +- sycl/include/CL/sycl/ONEAPI/reduction.hpp | 23 +++ sycl/include/CL/sycl/detail/cg.hpp | 12 +- sycl/include/CL/sycl/handler.hpp | 189 +++++++++++++----- sycl/source/detail/scheduler/commands.cpp | 13 +- sycl/source/detail/scheduler/commands.hpp | 4 +- sycl/source/handler.cpp | 4 +- sycl/test/abi/layout_handler.cpp | 18 +- sycl/test/abi/symbol_size_alignment.cpp | 2 +- .../scheduler/StreamInitDependencyOnHost.cpp | 14 +- 10 files changed, 206 insertions(+), 75 deletions(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 53bec90a35bb6..48044a804eb4e 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -15,7 +15,7 @@ include(AddSYCLExecutable) set(SYCL_MAJOR_VERSION 5) set(SYCL_MINOR_VERSION 1) set(SYCL_PATCH_VERSION 0) -set(SYCL_DEV_ABI_VERSION 0) +set(SYCL_DEV_ABI_VERSION 1) if (SYCL_ADD_DEV_VERSION_POSTFIX) set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}") endif() diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp index 3d3078ac95bc9..a73ea6ab687f7 100644 --- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp +++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp @@ -432,6 +432,29 @@ template struct AreAllButLastReductions { static constexpr bool value = !std::is_base_of::value; }; +/// Predicate returning true if all template type parameters except the last two +/// are reductions. +template +struct AreAllButLastTwoReductions { + static constexpr bool value = + std::is_base_of::value && + AreAllButLastTwoReductions::value; +}; + +/// Helper specialization of AreAllButLastTwoReductions for two elements. +/// Returns true if the template parameters are not a reduction. +template struct AreAllButLastTwoReductions { + static constexpr bool value = + !std::is_base_of::value && + !std::is_base_of::value; +}; + +/// Helper specialization of AreAllButLastTwoReductions for one element only. +/// Returns true if the template parameter is not a reduction. +template struct AreAllButLastTwoReductions { + static constexpr bool value = !std::is_base_of::value; +}; + /// This class encapsulates the reduction variable/accessor, /// the reduction operator and an optional operator identity. template MHostKernel; shared_ptr_class MSyclKernel; vector_class MArgs; @@ -128,7 +129,8 @@ class CGExecKernel : public CG { detail::OSModuleHandle MOSModuleHandle; vector_class> MStreams; - CGExecKernel(NDRDescT NDRDesc, unique_ptr_class HKernel, + CGExecKernel(NDRDescT NDRDesc, const property_list &PropList, + unique_ptr_class HKernel, shared_ptr_class SyclKernel, vector_class> ArgsStorage, vector_class AccStorage, @@ -142,10 +144,10 @@ class CGExecKernel : public CG { : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), std::move(loc)), - MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), - MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), - MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle), - MStreams(std::move(Streams)) { + MNDRDesc(std::move(NDRDesc)), MPropList(PropList), + MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), + MArgs(std::move(Args)), MKernelName(std::move(KernelName)), + MOSModuleHandle(OSModuleHandle), MStreams(std::move(Streams)) { assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) && "Wrong type of exec kernel CG."); } diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index ef60964e3d332..5f1d10aa17f5e 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -270,6 +270,7 @@ std::tuple...> tuple_select_elements(TupleT Tuple, std::index_sequence); template struct AreAllButLastReductions; +template struct AreAllButLastTwoReductions; } // namespace detail } // namespace ONEAPI @@ -716,8 +717,8 @@ class __SYCL_EXPORT handler { /// \param NumWorkItems is a range defining indexing space. /// \param KernelFunc is a SYCL kernel function. template - void parallel_for_lambda_impl(range NumWorkItems, - KernelType KernelFunc) { + void parallel_for_lambda_impl(range NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { throwIfActionIsCreated(); using LambdaArgType = sycl::detail::lambda_arg_type>; @@ -813,10 +814,12 @@ class __SYCL_EXPORT handler { range AdjustedRange = NumWorkItems; AdjustedRange.set_range_dim0(NewValX); #ifdef __SYCL_DEVICE_ONLY__ + (void)PropList; kernel_parallel_for(Wrapper); #else detail::checkValueRange(AdjustedRange); MNDRDesc.set(std::move(AdjustedRange)); + MPropList = PropList; StoreLambda( std::move(Wrapper)); MCGType = detail::CG::KERNEL; @@ -828,10 +831,12 @@ class __SYCL_EXPORT handler { { #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkItems; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); + MPropList = PropList; StoreLambda( std::move(KernelFunc)); MCGType = detail::CG::KERNEL; @@ -847,12 +852,14 @@ class __SYCL_EXPORT handler { /// \param NumWorkItems is a range defining indexing space. /// \param Kernel is a SYCL kernel function. template - void parallel_for_impl(range NumWorkItems, kernel Kernel) { + void parallel_for_impl(range NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); + MPropList = PropList; MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); @@ -996,19 +1003,22 @@ class __SYCL_EXPORT handler { /// \param KernelFunc is a SYCL kernel function. template #ifdef __SYCL_NONCONST_FUNCTOR__ - void single_task(KernelType KernelFunc) { + void single_task(KernelType KernelFunc, const property_list &PropList = {}) { #else - void single_task(const KernelType &KernelFunc) { + void single_task(const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; #ifdef __SYCL_DEVICE_ONLY__ + (void)PropList; kernel_single_task(KernelFunc); #else // No need to check if range is out of INT_MAX limits as it's compile-time // known constant. MNDRDesc.set(range<1>{1}); + MPropList = PropList; StoreLambda(KernelFunc); MCGType = detail::CG::KERNEL; @@ -1017,29 +1027,38 @@ class __SYCL_EXPORT handler { template #ifdef __SYCL_NONCONST_FUNCTOR__ - void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) { + void parallel_for(range<1> NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { #else - void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) { + void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif - parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc)); + parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc), + PropList); } template #ifdef __SYCL_NONCONST_FUNCTOR__ - void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) { + void parallel_for(range<2> NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { #else - void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) { + void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif - parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc)); + parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc), + PropList); } template #ifdef __SYCL_NONCONST_FUNCTOR__ - void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) { + void parallel_for(range<3> NumWorkItems, KernelType KernelFunc, + const property_list &PropList = {}) { #else - void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) { + void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc, + const property_list &PropList = {}) { #endif - parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc)); + parallel_for_lambda_impl(NumWorkItems, std::move(KernelFunc), + PropList); } /// Defines and invokes a SYCL kernel on host device. @@ -1110,7 +1129,8 @@ class __SYCL_EXPORT handler { template void parallel_for(range NumWorkItems, id WorkItemOffset, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1118,8 +1138,10 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkItems; (void)WorkItemOffset; + (void)PropList; kernel_parallel_for(KernelFunc); #else + MPropList = PropList; detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); StoreLambda(std::move(KernelFunc)); @@ -1141,8 +1163,8 @@ class __SYCL_EXPORT handler { /// \param KernelFunc is a SYCL kernel function. template - void parallel_for(nd_range ExecutionRange, - _KERNELFUNCPARAM(KernelFunc)) { + void parallel_for(nd_range ExecutionRange, _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1150,8 +1172,10 @@ class __SYCL_EXPORT handler { sycl::detail::lambda_arg_type>; #ifdef __SYCL_DEVICE_ONLY__ (void)ExecutionRange; + (void)PropList; kernel_parallel_for(KernelFunc); #else + MPropList = PropList; detail::checkValueRange(ExecutionRange); MNDRDesc.set(std::move(ExecutionRange)); StoreLambda(std::move(KernelFunc)); @@ -1168,7 +1192,9 @@ class __SYCL_EXPORT handler { detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { + MPropList = PropList; ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, Redu, Redu.getUserAccessor()); } @@ -1182,7 +1208,9 @@ class __SYCL_EXPORT handler { detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { + MPropList = PropList; ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, Redu, Redu.getUSMPointer()); } @@ -1202,7 +1230,9 @@ class __SYCL_EXPORT handler { detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { + MPropList = PropList; shared_ptr_class QueueCopy = MQueue; auto RWAcc = Redu.getReadWriteScalarAcc(*this); ONEAPI::detail::reduCGFunc(*this, KernelFunc, Range, Redu, @@ -1238,7 +1268,8 @@ class __SYCL_EXPORT handler { int Dims, typename Reduction> detail::enable_if_t parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { // This parallel_for() is lowered to the following sequence: // 1) Call a kernel that a) call user's lambda function and b) performs // one iteration of reduction, storing the partial reductions/sums @@ -1260,6 +1291,7 @@ class __SYCL_EXPORT handler { // the main kernel, but simply generate Range.get_global_range.size() number // of partial sums, leaving the reduction work to the additional/aux // kernels. + MPropList = PropList; constexpr bool HFR = Reduction::has_fast_reduce; size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type); // TODO: currently the maximal work group size is determined for the given @@ -1302,20 +1334,16 @@ class __SYCL_EXPORT handler { } // end while (NWorkItems > 1) } - // This version of parallel_for may handle one or more reductions packed in - // \p Rest argument. Note thought that the last element in \p Rest pack is - // the kernel function. - // TODO: this variant is currently enabled for 2+ reductions only as the - // versions handling 1 reduction variable are more efficient right now. - template - std::enable_if_t<(sizeof...(RestT) >= 3 && - ONEAPI::detail::AreAllButLastReductions::value)> - parallel_for(nd_range Range, RestT... Rest) { + void parallel_for_impl(nd_range Range, RestT... Rest) { std::tuple ArgsTuple(Rest...); constexpr size_t NumArgs = sizeof...(RestT); - auto KernelFunc = std::get(ArgsTuple); - auto ReduIndices = std::make_index_sequence(); + constexpr size_t Offset = hasPropList ? 2 : 1; + auto KernelFunc = std::get(ArgsTuple); + auto ReduIndices = std::make_index_sequence(); auto ReduTuple = ONEAPI::detail::tuple_select_elements(ArgsTuple, ReduIndices); @@ -1350,6 +1378,36 @@ class __SYCL_EXPORT handler { } // end while (NWorkItems > 1) } + // This version of parallel_for may handle one or more reductions packed in + // \p Rest argument. Note though that the last element in \p Rest pack is + // the kernel function. + // TODO: this variant is currently enabled for 2+ reductions only as the + // versions handling 1 reduction variable are more efficient right now. + template + std::enable_if_t<(sizeof...(RestT) >= 3 && + ONEAPI::detail::AreAllButLastReductions::value)> + parallel_for(nd_range Range, RestT... Rest) { + parallel_for_impl(Range, Rest...); + } + + // This version of parallel_for may handle one or more reductions packed in + // \p Rest argument. Note though that the last two elements in \p Rest pack + // are the kernel function and the property list. + // TODO: this variant is currently enabled for 2+ reductions only as the + // versions handling 1 reduction variable are more efficient right now. + template + std::enable_if_t< + (sizeof...(RestT) >= 4 && + ONEAPI::detail::AreAllButLastTwoReductions::value)> + parallel_for(nd_range Range, RestT... Rest) { + std::tuple ArgsTuple(Rest...); + constexpr size_t NumArgs = sizeof...(RestT); + MPropList = std::get(ArgsTuple); + parallel_for_impl(Range, Rest...); + } + /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. /// @@ -1363,7 +1421,8 @@ class __SYCL_EXPORT handler { template void parallel_for_work_group(range NumWorkGroups, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1371,10 +1430,12 @@ class __SYCL_EXPORT handler { sycl::detail::lambda_arg_type>; #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkGroups; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else detail::checkValueRange(NumWorkGroups); MNDRDesc.setNumWorkGroups(NumWorkGroups); + MPropList = PropList; StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; #endif // __SYCL_DEVICE_ONLY__ @@ -1396,7 +1457,8 @@ class __SYCL_EXPORT handler { int Dims> void parallel_for_work_group(range NumWorkGroups, range WorkGroupSize, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1405,12 +1467,14 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)NumWorkGroups; (void)WorkGroupSize; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else nd_range ExecRange = nd_range(NumWorkGroups * WorkGroupSize, WorkGroupSize); detail::checkValueRange(ExecRange); MNDRDesc.set(std::move(ExecRange)); + MPropList = PropList; StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; #endif // __SYCL_DEVICE_ONLY__ @@ -1422,28 +1486,32 @@ class __SYCL_EXPORT handler { /// cannot be called on host. /// /// \param Kernel is a SYCL kernel object. - void single_task(kernel Kernel) { + void single_task(kernel Kernel, const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); // No need to check if range is out of INT_MAX limits as it's compile-time // known constant MNDRDesc.set(range<1>{1}); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); } - void parallel_for(range<1> NumWorkItems, kernel Kernel) { - parallel_for_impl(NumWorkItems, Kernel); + void parallel_for(range<1> NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { + parallel_for_impl(NumWorkItems, Kernel, PropList); } - void parallel_for(range<2> NumWorkItems, kernel Kernel) { - parallel_for_impl(NumWorkItems, Kernel); + void parallel_for(range<2> NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { + parallel_for_impl(NumWorkItems, Kernel, PropList); } - void parallel_for(range<3> NumWorkItems, kernel Kernel) { - parallel_for_impl(NumWorkItems, Kernel); + void parallel_for(range<3> NumWorkItems, kernel Kernel, + const property_list &PropList = {}) { + parallel_for_impl(NumWorkItems, Kernel, PropList); } /// Defines and invokes a SYCL kernel function for the specified range and @@ -1456,12 +1524,13 @@ class __SYCL_EXPORT handler { /// \param Kernel is a SYCL kernel function. template void parallel_for(range NumWorkItems, id WorkItemOffset, - kernel Kernel) { + kernel Kernel, const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); + MPropList = PropList; MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1475,12 +1544,15 @@ class __SYCL_EXPORT handler { /// \param NDRange is a ND-range defining global and local sizes as /// well as offset. /// \param Kernel is a SYCL kernel function. - template void parallel_for(nd_range NDRange, kernel Kernel) { + template + void parallel_for(nd_range NDRange, kernel Kernel, + const property_list &PropList = {}) { throwIfActionIsCreated(); verifyKernelInvoc(Kernel); MKernel = detail::getSyclObjImpl(std::move(Kernel)); detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); + MPropList = PropList; MCGType = detail::CG::KERNEL; extractArgsAndReqs(); MKernelName = getKernelName(); @@ -1493,17 +1565,20 @@ class __SYCL_EXPORT handler { /// \param KernelFunc is a lambda that is used if device, queue is bound to, /// is a host device. template - void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) { + void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; + (void)PropList; kernel_single_task(KernelFunc); #else // No need to check if range is out of INT_MAX limits as it's compile-time // known constant MNDRDesc.set(range<1>{1}); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1533,7 +1608,8 @@ class __SYCL_EXPORT handler { template void parallel_for(kernel Kernel, range NumWorkItems, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1541,10 +1617,12 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; (void)NumWorkItems; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NumWorkItems); MNDRDesc.set(std::move(NumWorkItems)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1568,7 +1646,8 @@ class __SYCL_EXPORT handler { template void parallel_for(kernel Kernel, range NumWorkItems, - id WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) { + id WorkItemOffset, _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1577,10 +1656,12 @@ class __SYCL_EXPORT handler { (void)Kernel; (void)NumWorkItems; (void)WorkItemOffset; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NumWorkItems, WorkItemOffset); MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1604,7 +1685,8 @@ class __SYCL_EXPORT handler { template void parallel_for(kernel Kernel, nd_range NDRange, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1613,10 +1695,12 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; (void)NDRange; + (void)PropList; kernel_parallel_for(KernelFunc); #else detail::checkValueRange(NDRange); MNDRDesc.set(std::move(NDRange)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); MCGType = detail::CG::KERNEL; if (!MIsHost && !lambdaAndKernelHaveEqualName()) { @@ -1644,7 +1728,8 @@ class __SYCL_EXPORT handler { template void parallel_for_work_group(kernel Kernel, range NumWorkGroups, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1653,10 +1738,12 @@ class __SYCL_EXPORT handler { #ifdef __SYCL_DEVICE_ONLY__ (void)Kernel; (void)NumWorkGroups; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else detail::checkValueRange(NumWorkGroups); MNDRDesc.setNumWorkGroups(NumWorkGroups); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; @@ -1682,7 +1769,8 @@ class __SYCL_EXPORT handler { int Dims> void parallel_for_work_group(kernel Kernel, range NumWorkGroups, range WorkGroupSize, - _KERNELFUNCPARAM(KernelFunc)) { + _KERNELFUNCPARAM(KernelFunc), + const property_list &PropList = {}) { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; @@ -1692,12 +1780,14 @@ class __SYCL_EXPORT handler { (void)Kernel; (void)NumWorkGroups; (void)WorkGroupSize; + (void)PropList; kernel_parallel_for_work_group(KernelFunc); #else nd_range ExecRange = nd_range(NumWorkGroups * WorkGroupSize, WorkGroupSize); detail::checkValueRange(ExecRange); MNDRDesc.set(std::move(ExecRange)); + MPropList = PropList; MKernel = detail::getSyclObjImpl(std::move(Kernel)); StoreLambda(std::move(KernelFunc)); MCGType = detail::CG::KERNEL; @@ -2066,6 +2156,7 @@ class __SYCL_EXPORT handler { detail::code_location MCodeLoc = {}; bool MIsFinalized = false; event MLastEvent; + property_list MPropList; // Make queue_impl class friend to be able to call finalize method. friend class detail::queue_impl; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2e0e28a3538f7..523a307c423da 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1641,8 +1641,8 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { pi_result ExecCGCommand::SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, RT::PiEvent &Event, - ProgramManager::KernelArgMask EliminatedArgMask) { + const property_list &PropList, std::vector &RawEvents, + RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask) { vector_class &Args = ExecKernel->MArgs; // TODO this is not necessary as long as we can guarantee that the arguments // are already sorted (e. g. handle the sorting in handler if necessary due @@ -1884,6 +1884,7 @@ cl_int ExecCGCommand::enqueueImp() { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; + const property_list &PropList = ExecKernel->MPropList; if (MQueue->is_host()) { for (ArgDesc &Arg : ExecKernel->MArgs) @@ -1950,11 +1951,11 @@ cl_int ExecCGCommand::enqueueImp() { if (KernelMutex != nullptr) { // For cacheable kernels, we use per-kernel mutex std::lock_guard Lock(*KernelMutex); - Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event, EliminatedArgMask); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, PropList, + RawEvents, Event, EliminatedArgMask); } else { - Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event, EliminatedArgMask); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, PropList, + RawEvents, Event, EliminatedArgMask); } if (PI_SUCCESS != Error) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8713096be9127..e63664e66512c 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -520,8 +520,8 @@ class ExecCGCommand : public Command { pi_result SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, RT::PiEvent &Event, - ProgramManager::KernelArgMask EliminatedArgMask); + const property_list &, std::vector &RawEvents, + RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask); std::unique_ptr MCommandGroup; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5e63addf8e446..77231bf6dfb7b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -33,8 +33,8 @@ event handler::finalize() { case detail::CG::KERNEL: case detail::CG::RUN_ON_HOST_INTEL: { CommandGroup.reset(new detail::CGExecKernel( - std::move(MNDRDesc), std::move(MHostKernel), std::move(MKernel), - std::move(MArgsStorage), std::move(MAccStorage), + std::move(MNDRDesc), std::move(MPropList), std::move(MHostKernel), + std::move(MKernel), std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), std::move(MArgs), std::move(MKernelName), std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp index 9ba991356e6ad..2f623607363a9 100644 --- a/sycl/test/abi/layout_handler.cpp +++ b/sycl/test/abi/layout_handler.cpp @@ -186,5 +186,19 @@ void foo() { // CHECK-NEXT: 544 | std::__shared_ptr::element_type * _M_ptr // CHECK-NEXT: 552 | class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount // CHECK-NEXT: 552 | _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi -// CHECK-NEXT: | [sizeof=560, dsize=560, align=8, -// CHECK-NEXT: | nvsize=560, nvalign=8] +// CHECK-NEXT: 560 | class sycl::property_list MPropList +// CHECK-NEXT: 560 | class sycl::detail::PropertyListBase (base) +// CHECK-NEXT: 560 | class std::bitset<7> MDataLessProps +// CHECK-NEXT: 560 | struct std::_Base_bitset<1> (base) +// CHECK-NEXT: 560 | std::_Base_bitset<1>::_WordT _M_w +// CHECK-NEXT: 568 | class std::vector > MPropsWithData +// CHECK-NEXT: 568 | struct std::_Vector_base, class std::allocator > > (base) +// CHECK-NEXT: 568 | struct std::_Vector_base, class std::allocator > >::_Vector_impl _M_impl +// CHECK-NEXT: 568 | class std::allocator > (base) (empty) +// CHECK-NEXT: 568 | class __gnu_cxx::new_allocator > (base) (empty) +// CHECK-NEXT: 568 | struct std::_Vector_base, class std::allocator > >::_Vector_impl_data (base) +// CHECK-NEXT: 568 | std::_Vector_base, class std::allocator > >::pointer _M_start +// CHECK-NEXT: 576 | std::_Vector_base, class std::allocator > >::pointer _M_finish +// CHECK-NEXT: 584 | std::_Vector_base, class std::allocator > >::pointer _M_end_of_storage +// CHECK-NEXT: | [sizeof=592, dsize=592, align=8, +// CHECK-NEXT: | nvsize=592, nvalign=8] diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 4ff2cdac83d8e..81fd10efde62a 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -62,7 +62,7 @@ int main() { check(); check, 272, 8>(); #else - check(); + check(); check(); check, 240, 8>(); #endif diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 01d9e8a20bd6f..d8568fda7b7df 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -33,13 +33,13 @@ class MockHandler : public sycl::handler { case detail::CG::KERNEL: case detail::CG::RUN_ON_HOST_INTEL: { CommandGroup.reset(new detail::CGExecKernel( - std::move(CGH->MNDRDesc), std::move(CGH->MHostKernel), - std::move(CGH->MKernel), std::move(CGH->MArgsStorage), - std::move(CGH->MAccStorage), std::move(CGH->MSharedPtrStorage), - std::move(CGH->MRequirements), std::move(CGH->MEvents), - std::move(CGH->MArgs), std::move(CGH->MKernelName), - std::move(CGH->MOSModuleHandle), std::move(CGH->MStreamStorage), - CGH->MCGType, CGH->MCodeLoc)); + std::move(CGH->MNDRDesc), std::move(CGH->MPropList), + std::move(CGH->MHostKernel), std::move(CGH->MKernel), + std::move(CGH->MArgsStorage), std::move(CGH->MAccStorage), + std::move(CGH->MSharedPtrStorage), std::move(CGH->MRequirements), + std::move(CGH->MEvents), std::move(CGH->MArgs), + std::move(CGH->MKernelName), std::move(CGH->MOSModuleHandle), + std::move(CGH->MStreamStorage), CGH->MCGType, CGH->MCodeLoc)); break; } default: