From 6cc026b97ca5ba0396b3a6e06706840260a90f3a Mon Sep 17 00:00:00 2001
From: Artur Gainullin <artur.gainullin@intel.com>
Date: Thu, 25 Mar 2021 11:11:36 -0700
Subject: [PATCH] [SYCL] Runtime property lists on kernel invocation functions

Add an optional parameter to kernel invocation functions which accepts
property list. Such properties are going to affect the way the kernel
is invoked. Example is a property to set cache/SLM size for a kernel
invocation which is going to be implemented in upcoming PR.
---
 sycl/CMakeLists.txt                           |   2 +-
 sycl/include/CL/sycl/ONEAPI/reduction.hpp     |  23 +++
 sycl/include/CL/sycl/detail/cg.hpp            |  12 +-
 sycl/include/CL/sycl/handler.hpp              | 189 +++++++++++++-----
 sycl/source/detail/scheduler/commands.cpp     |  13 +-
 sycl/source/detail/scheduler/commands.hpp     |   4 +-
 sycl/source/handler.cpp                       |   4 +-
 sycl/test/abi/layout_handler.cpp              |  18 +-
 sycl/test/abi/symbol_size_alignment.cpp       |   2 +-
 .../scheduler/StreamInitDependencyOnHost.cpp  |  14 +-
 10 files changed, 206 insertions(+), 75 deletions(-)
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index 53bec90a35bb6..48044a804eb4e 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -15,7 +15,7 @@ include(AddSYCLExecutable)
 set(SYCL_MAJOR_VERSION 5)
 set(SYCL_MINOR_VERSION 1)
 set(SYCL_PATCH_VERSION 0)
-set(SYCL_DEV_ABI_VERSION 0)
+set(SYCL_DEV_ABI_VERSION 1)
 if (SYCL_ADD_DEV_VERSION_POSTFIX)
   set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}")
 endif()
diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 3d3078ac95bc9..a73ea6ab687f7 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -432,6 +432,29 @@ template <typename T> struct AreAllButLastReductions<T> {
   static constexpr bool value = !std::is_base_of<reduction_impl_base, T>::value;
 };
 
+/// Predicate returning true if all template type parameters except the last two
+/// are reductions.
+template <typename FirstT, typename... RestT>
+struct AreAllButLastTwoReductions {
+  static constexpr bool value =
+      std::is_base_of<reduction_impl_base, FirstT>::value &&
+      AreAllButLastTwoReductions<RestT...>::value;
+};
+
+/// Helper specialization of AreAllButLastTwoReductions for two elements.
+/// Returns true if the template parameters are not a reduction.
+template <typename T1, typename T2> struct AreAllButLastTwoReductions<T1, T2> {
+  static constexpr bool value =
+      !std::is_base_of<reduction_impl_base, T1>::value &&
+      !std::is_base_of<reduction_impl_base, T2>::value;
+};
+
+/// Helper specialization of AreAllButLastTwoReductions for one element only.
+/// Returns true if the template parameter is not a reduction.
+template <typename T> struct AreAllButLastTwoReductions<T> {
+  static constexpr bool value = !std::is_base_of<reduction_impl_base, T>::value;
+};
+
 /// This class encapsulates the reduction variable/accessor,
 /// the reduction operator and an optional operator identity.
 template <typename T, class BinaryOperation, int Dims, bool IsUSM,
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index 420ec280cffeb..0fdb8014fda7e 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -121,6 +121,7 @@ class CGExecKernel : public CG {
 public:
   /// Stores ND-range description.
   NDRDescT MNDRDesc;
+  property_list MPropList;
   unique_ptr_class<HostKernelBase> MHostKernel;
   shared_ptr_class<detail::kernel_impl> MSyclKernel;
   vector_class<ArgDesc> MArgs;
@@ -128,7 +129,8 @@ class CGExecKernel : public CG {
   detail::OSModuleHandle MOSModuleHandle;
   vector_class<shared_ptr_class<detail::stream_impl>> MStreams;
 
-  CGExecKernel(NDRDescT NDRDesc, unique_ptr_class<HostKernelBase> HKernel,
+  CGExecKernel(NDRDescT NDRDesc, const property_list &PropList,
+               unique_ptr_class<HostKernelBase> HKernel,
                shared_ptr_class<detail::kernel_impl> SyclKernel,
                vector_class<vector_class<char>> ArgsStorage,
                vector_class<detail::AccessorImplPtr> AccStorage,
@@ -142,10 +144,10 @@ class CGExecKernel : public CG {
       : CG(Type, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
            std::move(Events), std::move(loc)),
-        MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
-        MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)),
-        MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle),
-        MStreams(std::move(Streams)) {
+        MNDRDesc(std::move(NDRDesc)), MPropList(PropList),
+        MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)),
+        MArgs(std::move(Args)), MKernelName(std::move(KernelName)),
+        MOSModuleHandle(OSModuleHandle), MStreams(std::move(Streams)) {
     assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) &&
            "Wrong type of exec kernel CG.");
   }
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index ef60964e3d332..5f1d10aa17f5e 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -270,6 +270,7 @@ std::tuple<std::tuple_element_t<Is, TupleT>...>
 tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>);
 
 template <typename FirstT, typename... RestT> struct AreAllButLastReductions;
+template <typename FirstT, typename... RestT> struct AreAllButLastTwoReductions;
 
 } // namespace detail
 } // namespace ONEAPI
@@ -716,8 +717,8 @@ class __SYCL_EXPORT handler {
   /// \param NumWorkItems is a range defining indexing space.
   /// \param KernelFunc is a SYCL kernel function.
   template <typename KernelName, typename KernelType, int Dims>
-  void parallel_for_lambda_impl(range<Dims> NumWorkItems,
-                                KernelType KernelFunc) {
+  void parallel_for_lambda_impl(range<Dims> NumWorkItems, KernelType KernelFunc,
+                                const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using LambdaArgType = sycl::detail::lambda_arg_type<KernelType, item<Dims>>;
 
@@ -813,10 +814,12 @@ class __SYCL_EXPORT handler {
       range<Dims> AdjustedRange = NumWorkItems;
       AdjustedRange.set_range_dim0(NewValX);
 #ifdef __SYCL_DEVICE_ONLY__
+      (void)PropList;
       kernel_parallel_for<NameWT, TransformedArgType>(Wrapper);
 #else
       detail::checkValueRange<Dims>(AdjustedRange);
       MNDRDesc.set(std::move(AdjustedRange));
+      MPropList = PropList;
       StoreLambda<NameWT, decltype(Wrapper), Dims, TransformedArgType>(
           std::move(Wrapper));
       MCGType = detail::CG::KERNEL;
@@ -828,10 +831,12 @@ class __SYCL_EXPORT handler {
     {
 #ifdef __SYCL_DEVICE_ONLY__
       (void)NumWorkItems;
+      (void)PropList;
       kernel_parallel_for<NameT, TransformedArgType>(KernelFunc);
 #else
       detail::checkValueRange<Dims>(NumWorkItems);
       MNDRDesc.set(std::move(NumWorkItems));
+      MPropList = PropList;
       StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
           std::move(KernelFunc));
       MCGType = detail::CG::KERNEL;
@@ -847,12 +852,14 @@ class __SYCL_EXPORT handler {
   /// \param NumWorkItems is a range defining indexing space.
   /// \param Kernel is a SYCL kernel function.
   template <int Dims>
-  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel) {
+  void parallel_for_impl(range<Dims> NumWorkItems, kernel Kernel,
+                         const property_list &PropList = {}) {
     throwIfActionIsCreated();
     verifyKernelInvoc(Kernel);
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     detail::checkValueRange<Dims>(NumWorkItems);
     MNDRDesc.set(std::move(NumWorkItems));
+    MPropList = PropList;
     MCGType = detail::CG::KERNEL;
     extractArgsAndReqs();
     MKernelName = getKernelName();
@@ -996,19 +1003,22 @@ class __SYCL_EXPORT handler {
   /// \param KernelFunc is a SYCL kernel function.
   template <typename KernelName = detail::auto_name, typename KernelType>
 #ifdef __SYCL_NONCONST_FUNCTOR__
-  void single_task(KernelType KernelFunc) {
+  void single_task(KernelType KernelFunc, const property_list &PropList = {}) {
 #else
-  void single_task(const KernelType &KernelFunc) {
+  void single_task(const KernelType &KernelFunc,
+                   const property_list &PropList = {}) {
 #endif
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
+    (void)PropList;
     kernel_single_task<NameT>(KernelFunc);
 #else
     // No need to check if range is out of INT_MAX limits as it's compile-time
     // known constant.
     MNDRDesc.set(range<1>{1});
+    MPropList = PropList;
 
     StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(KernelFunc);
     MCGType = detail::CG::KERNEL;
@@ -1017,29 +1027,38 @@ class __SYCL_EXPORT handler {
 
   template <typename KernelName = detail::auto_name, typename KernelType>
 #ifdef __SYCL_NONCONST_FUNCTOR__
-  void parallel_for(range<1> NumWorkItems, KernelType KernelFunc) {
+  void parallel_for(range<1> NumWorkItems, KernelType KernelFunc,
+                    const property_list &PropList = {}) {
 #else
-  void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc) {
+  void parallel_for(range<1> NumWorkItems, const KernelType &KernelFunc,
+                    const property_list &PropList = {}) {
 #endif
-    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
+    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc),
+                                         PropList);
   }
 
   template <typename KernelName = detail::auto_name, typename KernelType>
 #ifdef __SYCL_NONCONST_FUNCTOR__
-  void parallel_for(range<2> NumWorkItems, KernelType KernelFunc) {
+  void parallel_for(range<2> NumWorkItems, KernelType KernelFunc,
+                    const property_list &PropList = {}) {
 #else
-  void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc) {
+  void parallel_for(range<2> NumWorkItems, const KernelType &KernelFunc,
+                    const property_list &PropList = {}) {
 #endif
-    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
+    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc),
+                                         PropList);
   }
 
   template <typename KernelName = detail::auto_name, typename KernelType>
 #ifdef __SYCL_NONCONST_FUNCTOR__
-  void parallel_for(range<3> NumWorkItems, KernelType KernelFunc) {
+  void parallel_for(range<3> NumWorkItems, KernelType KernelFunc,
+                    const property_list &PropList = {}) {
 #else
-  void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc) {
+  void parallel_for(range<3> NumWorkItems, const KernelType &KernelFunc,
+                    const property_list &PropList = {}) {
 #endif
-    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc));
+    parallel_for_lambda_impl<KernelName>(NumWorkItems, std::move(KernelFunc),
+                                         PropList);
   }
 
   /// Defines and invokes a SYCL kernel on host device.
@@ -1110,7 +1129,8 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
   void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                    _KERNELFUNCPARAM(KernelFunc)) {
+                    _KERNELFUNCPARAM(KernelFunc),
+                    const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1118,8 +1138,10 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_DEVICE_ONLY__
     (void)NumWorkItems;
     (void)WorkItemOffset;
+    (void)PropList;
     kernel_parallel_for<NameT, LambdaArgType>(KernelFunc);
 #else
+    MPropList = PropList;
     detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
     MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
     StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
@@ -1141,8 +1163,8 @@ class __SYCL_EXPORT handler {
   /// \param KernelFunc is a SYCL kernel function.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  void parallel_for(nd_range<Dims> ExecutionRange,
-                    _KERNELFUNCPARAM(KernelFunc)) {
+  void parallel_for(nd_range<Dims> ExecutionRange, _KERNELFUNCPARAM(KernelFunc),
+                    const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1150,8 +1172,10 @@ class __SYCL_EXPORT handler {
         sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
 #ifdef __SYCL_DEVICE_ONLY__
     (void)ExecutionRange;
+    (void)PropList;
     kernel_parallel_for<NameT, LambdaArgType>(KernelFunc);
 #else
+    MPropList = PropList;
     detail::checkValueRange<Dims>(ExecutionRange);
     MNDRDesc.set(std::move(ExecutionRange));
     StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
@@ -1168,7 +1192,9 @@ class __SYCL_EXPORT handler {
   detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                       Reduction::has_fast_atomics && !Reduction::is_usm>
   parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
+               _KERNELFUNCPARAM(KernelFunc),
+               const property_list &PropList = {}) {
+    MPropList = PropList;
     ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
                                            Redu.getUserAccessor());
   }
@@ -1182,7 +1208,9 @@ class __SYCL_EXPORT handler {
   detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                       Reduction::has_fast_atomics && Reduction::is_usm>
   parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
+               _KERNELFUNCPARAM(KernelFunc),
+               const property_list &PropList = {}) {
+    MPropList = PropList;
     ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
                                            Redu.getUSMPointer());
   }
@@ -1202,7 +1230,9 @@ class __SYCL_EXPORT handler {
   detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
                       Reduction::has_fast_atomics>
   parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
+               _KERNELFUNCPARAM(KernelFunc),
+               const property_list &PropList = {}) {
+    MPropList = PropList;
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     auto RWAcc = Redu.getReadWriteScalarAcc(*this);
     ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
@@ -1238,7 +1268,8 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<!Reduction::has_fast_atomics>
   parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
+               _KERNELFUNCPARAM(KernelFunc),
+               const property_list &PropList = {}) {
     // This parallel_for() is lowered to the following sequence:
     // 1) Call a kernel that a) call user's lambda function and b) performs
     //    one iteration of reduction, storing the partial reductions/sums
@@ -1260,6 +1291,7 @@ class __SYCL_EXPORT handler {
     // the main kernel, but simply generate Range.get_global_range.size() number
     // of partial sums, leaving the reduction work to the additional/aux
     // kernels.
+    MPropList = PropList;
     constexpr bool HFR = Reduction::has_fast_reduce;
     size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
     // TODO: currently the maximal work group size is determined for the given
@@ -1302,20 +1334,16 @@ class __SYCL_EXPORT handler {
     } // end while (NWorkItems > 1)
   }
 
-  // This version of parallel_for may handle one or more reductions packed in
-  // \p Rest argument. Note thought that the last element in \p Rest pack is
-  // the kernel function.
-  // TODO: this variant is currently enabled for 2+ reductions only as the
-  // versions handling 1 reduction variable are more efficient right now.
-  template <typename KernelName = detail::auto_name, int Dims,
+  // Helper template to handle parallel_for with 2+ reductions which can
+  // optionally have property_list as the last argument.
+  template <typename KernelName = detail::auto_name, int Dims, bool hasPropList,
             typename... RestT>
-  std::enable_if_t<(sizeof...(RestT) >= 3 &&
-                    ONEAPI::detail::AreAllButLastReductions<RestT...>::value)>
-  parallel_for(nd_range<Dims> Range, RestT... Rest) {
+  void parallel_for_impl(nd_range<Dims> Range, RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);
-    auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
-    auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
+    constexpr size_t Offset = hasPropList ? 2 : 1;
+    auto KernelFunc = std::get<NumArgs - Offset>(ArgsTuple);
+    auto ReduIndices = std::make_index_sequence<NumArgs - Offset>();
     auto ReduTuple =
         ONEAPI::detail::tuple_select_elements(ArgsTuple, ReduIndices);
 
@@ -1350,6 +1378,36 @@ class __SYCL_EXPORT handler {
     } // end while (NWorkItems > 1)
   }
 
+  // This version of parallel_for may handle one or more reductions packed in
+  // \p Rest argument. Note though that the last element in \p Rest pack is
+  // the kernel function.
+  // TODO: this variant is currently enabled for 2+ reductions only as the
+  // versions handling 1 reduction variable are more efficient right now.
+  template <typename KernelName = detail::auto_name, int Dims,
+            typename... RestT>
+  std::enable_if_t<(sizeof...(RestT) >= 3 &&
+                    ONEAPI::detail::AreAllButLastReductions<RestT...>::value)>
+  parallel_for(nd_range<Dims> Range, RestT... Rest) {
+    parallel_for_impl<KernelName, Dims, false, RestT...>(Range, Rest...);
+  }
+
+  // This version of parallel_for may handle one or more reductions packed in
+  // \p Rest argument. Note though that the last two elements in \p Rest pack
+  // are the kernel function and the property list.
+  // TODO: this variant is currently enabled for 2+ reductions only as the
+  // versions handling 1 reduction variable are more efficient right now.
+  template <typename KernelName = detail::auto_name, int Dims,
+            typename... RestT>
+  std::enable_if_t<
+      (sizeof...(RestT) >= 4 &&
+       ONEAPI::detail::AreAllButLastTwoReductions<RestT...>::value)>
+  parallel_for(nd_range<Dims> Range, RestT... Rest) {
+    std::tuple<RestT...> ArgsTuple(Rest...);
+    constexpr size_t NumArgs = sizeof...(RestT);
+    MPropList = std::get<NumArgs - 1>(ArgsTuple);
+    parallel_for_impl<KernelName, Dims, true, RestT...>(Range, Rest...);
+  }
+
   /// Hierarchical kernel invocation method of a kernel defined as a lambda
   /// encoding the body of each work-group to launch.
   ///
@@ -1363,7 +1421,8 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
   void parallel_for_work_group(range<Dims> NumWorkGroups,
-                               _KERNELFUNCPARAM(KernelFunc)) {
+                               _KERNELFUNCPARAM(KernelFunc),
+                               const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1371,10 +1430,12 @@ class __SYCL_EXPORT handler {
         sycl::detail::lambda_arg_type<KernelType, group<Dims>>;
 #ifdef __SYCL_DEVICE_ONLY__
     (void)NumWorkGroups;
+    (void)PropList;
     kernel_parallel_for_work_group<NameT, LambdaArgType>(KernelFunc);
 #else
     detail::checkValueRange<Dims>(NumWorkGroups);
     MNDRDesc.setNumWorkGroups(NumWorkGroups);
+    MPropList = PropList;
     StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
 #endif // __SYCL_DEVICE_ONLY__
@@ -1396,7 +1457,8 @@ class __SYCL_EXPORT handler {
             int Dims>
   void parallel_for_work_group(range<Dims> NumWorkGroups,
                                range<Dims> WorkGroupSize,
-                               _KERNELFUNCPARAM(KernelFunc)) {
+                               _KERNELFUNCPARAM(KernelFunc),
+                               const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1405,12 +1467,14 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_DEVICE_ONLY__
     (void)NumWorkGroups;
     (void)WorkGroupSize;
+    (void)PropList;
     kernel_parallel_for_work_group<NameT, LambdaArgType>(KernelFunc);
 #else
     nd_range<Dims> ExecRange =
         nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
     detail::checkValueRange<Dims>(ExecRange);
     MNDRDesc.set(std::move(ExecRange));
+    MPropList = PropList;
     StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
 #endif // __SYCL_DEVICE_ONLY__
@@ -1422,28 +1486,32 @@ class __SYCL_EXPORT handler {
   /// cannot be called on host.
   ///
   /// \param Kernel is a SYCL kernel object.
-  void single_task(kernel Kernel) {
+  void single_task(kernel Kernel, const property_list &PropList = {}) {
     throwIfActionIsCreated();
     verifyKernelInvoc(Kernel);
     // No need to check if range is out of INT_MAX limits as it's compile-time
     // known constant
     MNDRDesc.set(range<1>{1});
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     MCGType = detail::CG::KERNEL;
     extractArgsAndReqs();
     MKernelName = getKernelName();
   }
 
-  void parallel_for(range<1> NumWorkItems, kernel Kernel) {
-    parallel_for_impl(NumWorkItems, Kernel);
+  void parallel_for(range<1> NumWorkItems, kernel Kernel,
+                    const property_list &PropList = {}) {
+    parallel_for_impl(NumWorkItems, Kernel, PropList);
   }
 
-  void parallel_for(range<2> NumWorkItems, kernel Kernel) {
-    parallel_for_impl(NumWorkItems, Kernel);
+  void parallel_for(range<2> NumWorkItems, kernel Kernel,
+                    const property_list &PropList = {}) {
+    parallel_for_impl(NumWorkItems, Kernel, PropList);
   }
 
-  void parallel_for(range<3> NumWorkItems, kernel Kernel) {
-    parallel_for_impl(NumWorkItems, Kernel);
+  void parallel_for(range<3> NumWorkItems, kernel Kernel,
+                    const property_list &PropList = {}) {
+    parallel_for_impl(NumWorkItems, Kernel, PropList);
   }
 
   /// Defines and invokes a SYCL kernel function for the specified range and
@@ -1456,12 +1524,13 @@ class __SYCL_EXPORT handler {
   /// \param Kernel is a SYCL kernel function.
   template <int Dims>
   void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                    kernel Kernel) {
+                    kernel Kernel, const property_list &PropList = {}) {
     throwIfActionIsCreated();
     verifyKernelInvoc(Kernel);
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
     MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
+    MPropList = PropList;
     MCGType = detail::CG::KERNEL;
     extractArgsAndReqs();
     MKernelName = getKernelName();
@@ -1475,12 +1544,15 @@ class __SYCL_EXPORT handler {
   /// \param NDRange is a ND-range defining global and local sizes as
   /// well as offset.
   /// \param Kernel is a SYCL kernel function.
-  template <int Dims> void parallel_for(nd_range<Dims> NDRange, kernel Kernel) {
+  template <int Dims>
+  void parallel_for(nd_range<Dims> NDRange, kernel Kernel,
+                    const property_list &PropList = {}) {
     throwIfActionIsCreated();
     verifyKernelInvoc(Kernel);
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     detail::checkValueRange<Dims>(NDRange);
     MNDRDesc.set(std::move(NDRange));
+    MPropList = PropList;
     MCGType = detail::CG::KERNEL;
     extractArgsAndReqs();
     MKernelName = getKernelName();
@@ -1493,17 +1565,20 @@ class __SYCL_EXPORT handler {
   /// \param KernelFunc is a lambda that is used if device, queue is bound to,
   /// is a host device.
   template <typename KernelName = detail::auto_name, typename KernelType>
-  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc)) {
+  void single_task(kernel Kernel, _KERNELFUNCPARAM(KernelFunc),
+                   const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
 #ifdef __SYCL_DEVICE_ONLY__
     (void)Kernel;
+    (void)PropList;
     kernel_single_task<NameT>(KernelFunc);
 #else
     // No need to check if range is out of INT_MAX limits as it's compile-time
     // known constant
     MNDRDesc.set(range<1>{1});
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     MCGType = detail::CG::KERNEL;
     if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
@@ -1533,7 +1608,8 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
   void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
-                    _KERNELFUNCPARAM(KernelFunc)) {
+                    _KERNELFUNCPARAM(KernelFunc),
+                    const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1541,10 +1617,12 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_DEVICE_ONLY__
     (void)Kernel;
     (void)NumWorkItems;
+    (void)PropList;
     kernel_parallel_for<NameT, LambdaArgType>(KernelFunc);
 #else
     detail::checkValueRange<Dims>(NumWorkItems);
     MNDRDesc.set(std::move(NumWorkItems));
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     MCGType = detail::CG::KERNEL;
     if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
@@ -1568,7 +1646,8 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
   void parallel_for(kernel Kernel, range<Dims> NumWorkItems,
-                    id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc)) {
+                    id<Dims> WorkItemOffset, _KERNELFUNCPARAM(KernelFunc),
+                    const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1577,10 +1656,12 @@ class __SYCL_EXPORT handler {
     (void)Kernel;
     (void)NumWorkItems;
     (void)WorkItemOffset;
+    (void)PropList;
     kernel_parallel_for<NameT, LambdaArgType>(KernelFunc);
 #else
     detail::checkValueRange<Dims>(NumWorkItems, WorkItemOffset);
     MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset));
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     MCGType = detail::CG::KERNEL;
     if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
@@ -1604,7 +1685,8 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
   void parallel_for(kernel Kernel, nd_range<Dims> NDRange,
-                    _KERNELFUNCPARAM(KernelFunc)) {
+                    _KERNELFUNCPARAM(KernelFunc),
+                    const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1613,10 +1695,12 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_DEVICE_ONLY__
     (void)Kernel;
     (void)NDRange;
+    (void)PropList;
     kernel_parallel_for<NameT, LambdaArgType>(KernelFunc);
 #else
     detail::checkValueRange<Dims>(NDRange);
     MNDRDesc.set(std::move(NDRange));
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     MCGType = detail::CG::KERNEL;
     if (!MIsHost && !lambdaAndKernelHaveEqualName<NameT>()) {
@@ -1644,7 +1728,8 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
   void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
-                               _KERNELFUNCPARAM(KernelFunc)) {
+                               _KERNELFUNCPARAM(KernelFunc),
+                               const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1653,10 +1738,12 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_DEVICE_ONLY__
     (void)Kernel;
     (void)NumWorkGroups;
+    (void)PropList;
     kernel_parallel_for_work_group<NameT, LambdaArgType>(KernelFunc);
 #else
     detail::checkValueRange<Dims>(NumWorkGroups);
     MNDRDesc.setNumWorkGroups(NumWorkGroups);
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
@@ -1682,7 +1769,8 @@ class __SYCL_EXPORT handler {
             int Dims>
   void parallel_for_work_group(kernel Kernel, range<Dims> NumWorkGroups,
                                range<Dims> WorkGroupSize,
-                               _KERNELFUNCPARAM(KernelFunc)) {
+                               _KERNELFUNCPARAM(KernelFunc),
+                               const property_list &PropList = {}) {
     throwIfActionIsCreated();
     using NameT =
         typename detail::get_kernel_name_t<KernelName, KernelType>::name;
@@ -1692,12 +1780,14 @@ class __SYCL_EXPORT handler {
     (void)Kernel;
     (void)NumWorkGroups;
     (void)WorkGroupSize;
+    (void)PropList;
     kernel_parallel_for_work_group<NameT, LambdaArgType>(KernelFunc);
 #else
     nd_range<Dims> ExecRange =
         nd_range<Dims>(NumWorkGroups * WorkGroupSize, WorkGroupSize);
     detail::checkValueRange<Dims>(ExecRange);
     MNDRDesc.set(std::move(ExecRange));
+    MPropList = PropList;
     MKernel = detail::getSyclObjImpl(std::move(Kernel));
     StoreLambda<NameT, KernelType, Dims, LambdaArgType>(std::move(KernelFunc));
     MCGType = detail::CG::KERNEL;
@@ -2066,6 +2156,7 @@ class __SYCL_EXPORT handler {
   detail::code_location MCodeLoc = {};
   bool MIsFinalized = false;
   event MLastEvent;
+  property_list MPropList;
 
   // Make queue_impl class friend to be able to call finalize method.
   friend class detail::queue_impl;
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 2e0e28a3538f7..523a307c423da 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -1641,8 +1641,8 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
 
 pi_result ExecCGCommand::SetKernelParamsAndLaunch(
     CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc,
-    std::vector<RT::PiEvent> &RawEvents, RT::PiEvent &Event,
-    ProgramManager::KernelArgMask EliminatedArgMask) {
+    const property_list &PropList, std::vector<RT::PiEvent> &RawEvents,
+    RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask) {
   vector_class<ArgDesc> &Args = ExecKernel->MArgs;
   // TODO this is not necessary as long as we can guarantee that the arguments
   // are already sorted (e. g. handle the sorting in handler if necessary due
@@ -1884,6 +1884,7 @@ cl_int ExecCGCommand::enqueueImp() {
     CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
 
     NDRDescT &NDRDesc = ExecKernel->MNDRDesc;
+    const property_list &PropList = ExecKernel->MPropList;
 
     if (MQueue->is_host()) {
       for (ArgDesc &Arg : ExecKernel->MArgs)
@@ -1950,11 +1951,11 @@ cl_int ExecCGCommand::enqueueImp() {
     if (KernelMutex != nullptr) {
       // For cacheable kernels, we use per-kernel mutex
       std::lock_guard<std::mutex> Lock(*KernelMutex);
-      Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents,
-                                       Event, EliminatedArgMask);
+      Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, PropList,
+                                       RawEvents, Event, EliminatedArgMask);
     } else {
-      Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents,
-                                       Event, EliminatedArgMask);
+      Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, PropList,
+                                       RawEvents, Event, EliminatedArgMask);
     }
 
     if (PI_SUCCESS != Error) {
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 8713096be9127..e63664e66512c 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -520,8 +520,8 @@ class ExecCGCommand : public Command {
 
   pi_result SetKernelParamsAndLaunch(
       CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc,
-      std::vector<RT::PiEvent> &RawEvents, RT::PiEvent &Event,
-      ProgramManager::KernelArgMask EliminatedArgMask);
+      const property_list &, std::vector<RT::PiEvent> &RawEvents,
+      RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask);
 
   std::unique_ptr<detail::CG> MCommandGroup;
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 5e63addf8e446..77231bf6dfb7b 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -33,8 +33,8 @@ event handler::finalize() {
   case detail::CG::KERNEL:
   case detail::CG::RUN_ON_HOST_INTEL: {
     CommandGroup.reset(new detail::CGExecKernel(
-        std::move(MNDRDesc), std::move(MHostKernel), std::move(MKernel),
-        std::move(MArgsStorage), std::move(MAccStorage),
+        std::move(MNDRDesc), std::move(MPropList), std::move(MHostKernel),
+        std::move(MKernel), std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
         std::move(MEvents), std::move(MArgs), std::move(MKernelName),
         std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType,
diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp
index 9ba991356e6ad..2f623607363a9 100644
--- a/sycl/test/abi/layout_handler.cpp
+++ b/sycl/test/abi/layout_handler.cpp
@@ -186,5 +186,19 @@ void foo() {
 // CHECK-NEXT: 544 |         std::__shared_ptr<class sycl::detail::event_impl, __gnu_cxx::_S_atomic>::element_type * _M_ptr
 // CHECK-NEXT: 552 |         class std::__shared_count<__gnu_cxx::_S_atomic> _M_refcount
 // CHECK-NEXT: 552 |           _Sp_counted_base<(enum __gnu_cxx::_Lock_policy)2U> * _M_pi
-// CHECK-NEXT:     | [sizeof=560, dsize=560, align=8,
-// CHECK-NEXT:     |  nvsize=560, nvalign=8]
+// CHECK-NEXT: 560 |   class sycl::property_list MPropList
+// CHECK-NEXT: 560 |     class sycl::detail::PropertyListBase (base)
+// CHECK-NEXT: 560 |       class std::bitset<7> MDataLessProps
+// CHECK-NEXT: 560 |         struct std::_Base_bitset<1> (base)
+// CHECK-NEXT: 560 |           std::_Base_bitset<1>::_WordT _M_w
+// CHECK-NEXT: 568 |       class std::vector<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > MPropsWithData
+// CHECK-NEXT: 568 |         struct std::_Vector_base<class std::shared_ptr<class sycl::detail::PropertyWithDataBase>, class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > > (base)
+// CHECK-NEXT: 568 |           struct std::_Vector_base<class std::shared_ptr<class sycl::detail::PropertyWithDataBase>, class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > >::_Vector_impl _M_impl
+// CHECK-NEXT: 568 |             class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > (base) (empty)
+// CHECK-NEXT: 568 |               class __gnu_cxx::new_allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > (base) (empty)
+// CHECK-NEXT: 568 |             struct std::_Vector_base<class std::shared_ptr<class sycl::detail::PropertyWithDataBase>, class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > >::_Vector_impl_data (base)
+// CHECK-NEXT: 568 |               std::_Vector_base<class std::shared_ptr<class sycl::detail::PropertyWithDataBase>, class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > >::pointer _M_start
+// CHECK-NEXT: 576 |               std::_Vector_base<class std::shared_ptr<class sycl::detail::PropertyWithDataBase>, class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > >::pointer _M_finish
+// CHECK-NEXT: 584 |               std::_Vector_base<class std::shared_ptr<class sycl::detail::PropertyWithDataBase>, class std::allocator<class std::shared_ptr<class sycl::detail::PropertyWithDataBase> > >::pointer _M_end_of_storage
+// CHECK-NEXT:     | [sizeof=592, dsize=592, align=8,
+// CHECK-NEXT:     |  nvsize=592, nvalign=8]
diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp
index 4ff2cdac83d8e..81fd10efde62a 100644
--- a/sycl/test/abi/symbol_size_alignment.cpp
+++ b/sycl/test/abi/symbol_size_alignment.cpp
@@ -62,7 +62,7 @@ int main() {
   check<detail::buffer_impl, 216, 8>();
   check<detail::image_impl<1>, 272, 8>();
 #else
-  check<handler, 560, 8>();
+  check<handler, 592, 8>();
   check<detail::buffer_impl, 184, 8>();
   check<detail::image_impl<1>, 240, 8>();
 #endif
diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
index 01d9e8a20bd6f..d8568fda7b7df 100644
--- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
+++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
@@ -33,13 +33,13 @@ class MockHandler : public sycl::handler {
     case detail::CG::KERNEL:
     case detail::CG::RUN_ON_HOST_INTEL: {
       CommandGroup.reset(new detail::CGExecKernel(
-          std::move(CGH->MNDRDesc), std::move(CGH->MHostKernel),
-          std::move(CGH->MKernel), std::move(CGH->MArgsStorage),
-          std::move(CGH->MAccStorage), std::move(CGH->MSharedPtrStorage),
-          std::move(CGH->MRequirements), std::move(CGH->MEvents),
-          std::move(CGH->MArgs), std::move(CGH->MKernelName),
-          std::move(CGH->MOSModuleHandle), std::move(CGH->MStreamStorage),
-          CGH->MCGType, CGH->MCodeLoc));
+          std::move(CGH->MNDRDesc), std::move(CGH->MPropList),
+          std::move(CGH->MHostKernel), std::move(CGH->MKernel),
+          std::move(CGH->MArgsStorage), std::move(CGH->MAccStorage),
+          std::move(CGH->MSharedPtrStorage), std::move(CGH->MRequirements),
+          std::move(CGH->MEvents), std::move(CGH->MArgs),
+          std::move(CGH->MKernelName), std::move(CGH->MOSModuleHandle),
+          std::move(CGH->MStreamStorage), CGH->MCGType, CGH->MCodeLoc));
       break;
     }
     default: