From 322384284cd8b1ad71c8cbb93d6ba0f79ed65e9a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 3 Jul 2025 16:17:28 +0000 Subject: [PATCH 01/65] [SYCL] Handler-less kernel submit API --- sycl/include/sycl/queue.hpp | 89 +++++++++++++++++++++++++++++++ sycl/source/detail/queue_impl.hpp | 18 +++++++ sycl/source/queue.cpp | 21 ++++++++ 3 files changed, 128 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a0dbdf5c540e8..2898025ae40c8 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,6 +149,31 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t(*)(int); + +class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { +public: + ExtendedSubmissionInfo() {} + + std::string_view &KernelName() { return MKernelName; } + std::unique_ptr &HostKernel() { return MHostKernel; } + const std::unique_ptr &HostKernel() const { return MHostKernel; } + int &KernelNumArgs() { return MKernelNumArgs; } + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + bool &KernelIsESIMD() { return MKernelIsESIMD; } + bool &KernelHasSpecialCaptures() {return MKernelHasSpecialCaptures; } + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } + +private: + std::string_view MKernelName; + std::unique_ptr MHostKernel; + int MKernelNumArgs = 0; + KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; + bool MKernelIsESIMD = false; + bool MKernelHasSpecialCaptures = true; + detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; +}; + } // namespace v1 } // namespace detail @@ -3609,6 +3634,37 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } + template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; + }; + + template + void ProcessExtendedSubmitProperties(PropertiesT Props, + const KernelType &KernelFunc, + detail::v1::ExtendedSubmissionInfo &SI) const { + ProcessSubmitProperties(Props, SI); + + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>; + + SI.HostKernel().reset(new detail::HostKernel( + std::forward(KernelFunc))); + SI.KernelName() = detail::getKernelName(); + SI.KernelNumArgs() = detail::getKernelNumParams(); + SI.KernelParamDescGetter() = &(detail::getKernelParamDesc); + SI.KernelIsESIMD() = detail::isKernelESIMD(); + SI.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + SI.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. /// Not using `type_erased_cgfo_ty` on purpose. @@ -3680,6 +3736,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + event submit_with_event_impl(const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) const; + + event submit_with_event_impl(const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) const; + + event submit_with_event_impl(const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) const; + /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3763,6 +3834,24 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } + template + event submit_with_event(PropertiesT Props, + const nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::ExtendedSubmissionInfo SI{}; + ProcessExtendedSubmitProperties(Props, + KernelFunc, SI); + + // TODO UseFallbackAssert + + return submit_with_event_impl(Range, SI, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d6892011b991e..1034137916aa5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,6 +370,24 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } + event submit_with_event(const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return event(); + } + + event submit_with_event(const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return event(); + } + + event submit_with_event(const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return event(); + } + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index ed0b0e42e6e6f..bd899d871f0d8 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,6 +312,27 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +event queue::submit_with_event_impl( + const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); +} + +event queue::submit_with_event_impl( + const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); +} + +event queue::submit_with_event_impl( + const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); +} + void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, From fde19cab99ddc783c7a2f75328b1649f33b272ea Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 3 Jul 2025 16:22:06 +0000 Subject: [PATCH 02/65] Fix formatting --- sycl/include/sycl/queue.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 2898025ae40c8..f6ac18d1b4392 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3634,12 +3634,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } - template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; - }; + template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; + }; template From 13424de9a98bc5e88da518415c63d0719fc0129f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 4 Jul 2025 09:17:58 +0000 Subject: [PATCH 03/65] Fix formatting --- sycl/include/sycl/queue.hpp | 74 +++++++++++++++++-------------- sycl/source/detail/queue_impl.hpp | 33 ++++++++++---- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index f6ac18d1b4392..a09947c552ec8 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,7 +149,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t(*)(int); +using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { public: @@ -157,12 +157,18 @@ class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { std::string_view &KernelName() { return MKernelName; } std::unique_ptr &HostKernel() { return MHostKernel; } - const std::unique_ptr &HostKernel() const { return MHostKernel; } + const std::unique_ptr &HostKernel() const { + return MHostKernel; + } int &KernelNumArgs() { return MKernelNumArgs; } - KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { + return MKernelParamDescGetter; + } bool &KernelIsESIMD() { return MKernelIsESIMD; } - bool &KernelHasSpecialCaptures() {return MKernelHasSpecialCaptures; } - detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } + bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { + return MKernelNameBasedCachePtr; + } private: std::string_view MKernelName; @@ -3635,17 +3641,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; }; - template - void ProcessExtendedSubmitProperties(PropertiesT Props, - const KernelType &KernelFunc, - detail::v1::ExtendedSubmissionInfo &SI) const { + template + void ProcessExtendedSubmitProperties( + PropertiesT Props, const KernelType &KernelFunc, + detail::v1::ExtendedSubmissionInfo &SI) const { ProcessSubmitProperties(Props, SI); using NameT = @@ -3655,8 +3661,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::is_integral::value && Dims == 1, item, typename TransformUserItemType::type>; - SI.HostKernel().reset(new detail::HostKernel( - std::forward(KernelFunc))); + SI.HostKernel().reset( + new detail::HostKernel( + std::forward(KernelFunc))); SI.KernelName() = detail::getKernelName(); SI.KernelNumArgs() = detail::getKernelNumParams(); SI.KernelParamDescGetter() = &(detail::getKernelParamDesc); @@ -3736,20 +3743,20 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl(const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) const; + event submit_with_event_impl( + const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl(const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) const; + event submit_with_event_impl( + const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl(const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) const; + event submit_with_event_impl( + const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, @@ -3834,17 +3841,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template - event submit_with_event(PropertiesT Props, - const nd_range Range, + template + event submit_with_event(PropertiesT Props, const nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::ExtendedSubmissionInfo SI{}; - ProcessExtendedSubmitProperties(Props, - KernelFunc, SI); + ProcessExtendedSubmitProperties(Props, KernelFunc, + SI); // TODO UseFallbackAssert diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 1034137916aa5..e041ebb9d30f4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,21 +370,36 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - event submit_with_event(const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + event + submit_with_event(const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)Range; + (void)ExtSubmitInfo; + (void)CodeLoc; + (void)IsTopCodeLoc; return event(); } - event submit_with_event(const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + event + submit_with_event(const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)Range; + (void)ExtSubmitInfo; + (void)CodeLoc; + (void)IsTopCodeLoc; return event(); } - event submit_with_event(const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + event + submit_with_event(const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)Range; + (void)ExtSubmitInfo; + (void)CodeLoc; + (void)IsTopCodeLoc; return event(); } From fbc789d6f3bd591b1655faa7fd6357a4e74a337a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 7 Jul 2025 12:48:33 +0000 Subject: [PATCH 04/65] Change the ExtendedSubmissionInfo to KernelRuntimeInfo, expose the new APIs as public under a new define --- .../oneapi/experimental/enqueue_functions.hpp | 23 ++++++ sycl/include/sycl/queue.hpp | 81 ++++++++++++------- sycl/source/detail/queue_impl.hpp | 24 +++--- sycl/source/queue.cpp | 24 +++--- 4 files changed, 105 insertions(+), 47 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index f599078a6769e..5e2a6ec78feb4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -110,6 +110,16 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT>( Props, detail::type_erased_cgfo_ty{CGF}, nullptr, CodeLoc); } + +template +event submit_with_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); +} } // namespace detail template @@ -144,6 +154,19 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +event submit_with_event(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + return sycl::ext::oneapi::experimental::detail::submit_with_event_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif + template void single_task(handler &CGH, const KernelType &KernelObj) { CGH.single_task(KernelObj); diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a09947c552ec8..b2a1106aa051b 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,15 +151,14 @@ class __SYCL_EXPORT SubmissionInfo { using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); -class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { +// This class is intended to store the kernel runtime information, +// extracted from the compile time kernel structures. +class __SYCL_EXPORT KernelRuntimeInfo { public: - ExtendedSubmissionInfo() {} + KernelRuntimeInfo() {} std::string_view &KernelName() { return MKernelName; } std::unique_ptr &HostKernel() { return MHostKernel; } - const std::unique_ptr &HostKernel() const { - return MHostKernel; - } int &KernelNumArgs() { return MKernelNumArgs; } KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; @@ -198,6 +197,13 @@ template event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + +template +event submit_with_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); } // namespace detail } // namespace ext::oneapi::experimental @@ -3246,11 +3252,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + return submit_with_event(sycl::ext::oneapi::experimental::empty_properties_t{}, + Range, Rest..., CodeLoc); +#else return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); +#endif } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3627,6 +3639,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + template + friend event ext::oneapi::experimental::detail::submit_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); + template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3649,11 +3668,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - void ProcessExtendedSubmitProperties( + void ProcessKernelRuntimeInfo( PropertiesT Props, const KernelType &KernelFunc, - detail::v1::ExtendedSubmissionInfo &SI) const { - ProcessSubmitProperties(Props, SI); - + detail::v1::KernelRuntimeInfo &KRInfo) const { using NameT = typename detail::get_kernel_name_t::name; using LambdaArgType = sycl::detail::lambda_arg_type>; @@ -3661,15 +3678,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::is_integral::value && Dims == 1, item, typename TransformUserItemType::type>; - SI.HostKernel().reset( + KRInfo.HostKernel().reset( new detail::HostKernel( - std::forward(KernelFunc))); - SI.KernelName() = detail::getKernelName(); - SI.KernelNumArgs() = detail::getKernelNumParams(); - SI.KernelParamDescGetter() = &(detail::getKernelParamDesc); - SI.KernelIsESIMD() = detail::isKernelESIMD(); - SI.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - SI.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + KernelFunc)); + KRInfo.KernelName() = detail::getKernelName(); + KRInfo.KernelNumArgs() = detail::getKernelNumParams(); + KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); + KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); + KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES @@ -3744,18 +3761,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; event submit_with_event_impl( - const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<1> Range, + const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( - const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<2> Range, + const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( - const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<3> Range, + const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. @@ -3843,19 +3863,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - event submit_with_event(PropertiesT Props, const nd_range Range, + event submit_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::ExtendedSubmissionInfo SI{}; - ProcessExtendedSubmitProperties(Props, KernelFunc, - SI); + detail::v1::SubmissionInfo SI{}; + detail::v1::KernelRuntimeInfo KRInfo{}; + + ProcessSubmitProperties(Props, SI); + ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); // TODO UseFallbackAssert - return submit_with_event_impl(Range, SI, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + return submit_with_event_impl(Range, SI, KRInfo, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } /// Submits a command group function object to the queue, in order to be diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e041ebb9d30f4..7bff56d1b8e3a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -371,33 +371,39 @@ class queue_impl : public std::enable_shared_from_this { } event - submit_with_event(const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + submit_with_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)Range; - (void)ExtSubmitInfo; + (void)SubmitInfo; + (void)KRInfo; (void)CodeLoc; (void)IsTopCodeLoc; return event(); } event - submit_with_event(const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + submit_with_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)Range; - (void)ExtSubmitInfo; + (void)SubmitInfo; + (void)KRInfo; (void)CodeLoc; (void)IsTopCodeLoc; return event(); } event - submit_with_event(const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + submit_with_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)Range; - (void)ExtSubmitInfo; + (void)SubmitInfo; + (void)KRInfo; (void)CodeLoc; (void)IsTopCodeLoc; return event(); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index bd899d871f0d8..422148c0ec8f9 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -313,24 +313,30 @@ event queue::submit_with_event_impl( } event queue::submit_with_event_impl( - const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); + return impl->submit_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); } event queue::submit_with_event_impl( - const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); + return impl->submit_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); } event queue::submit_with_event_impl( - const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); + return impl->submit_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); } void queue::submit_without_event_impl( From 591b3ec4d6c955089137e672639a36ee3a72d205 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 8 Jul 2025 13:41:07 +0000 Subject: [PATCH 05/65] Added copy/move constructor and assignment operator --- sycl/include/sycl/queue.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b2a1106aa051b..b78a1e6770eca 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -157,14 +157,30 @@ class __SYCL_EXPORT KernelRuntimeInfo { public: KernelRuntimeInfo() {} + KernelRuntimeInfo(const KernelRuntimeInfo &rhs) = delete; + + KernelRuntimeInfo(KernelRuntimeInfo &&rhs) = delete; + + KernelRuntimeInfo &operator=(const KernelRuntimeInfo &rhs) = delete; + + KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; + std::string_view &KernelName() { return MKernelName; } + std::unique_ptr &HostKernel() { return MHostKernel; } + + detail::HostKernelBase *GetHostKernelPtr() { return MHostKernel.get(); } + int &KernelNumArgs() { return MKernelNumArgs; } + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + bool &KernelIsESIMD() { return MKernelIsESIMD; } + bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } From 664160133196383646de49dd4d3b6675acf70fc1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 11 Jul 2025 14:23:44 +0000 Subject: [PATCH 06/65] Add a no event submit and no handler compile flag --- sycl/CMakeLists.txt | 1 + .../oneapi/experimental/enqueue_functions.hpp | 25 +++- sycl/include/sycl/queue.hpp | 131 ++++++++++++++---- sycl/source/CMakeLists.txt | 8 ++ sycl/source/detail/queue_impl.cpp | 97 +++++++++++++ sycl/source/detail/queue_impl.hpp | 70 +++++++--- sycl/source/queue.cpp | 30 ++++ 7 files changed, 318 insertions(+), 44 deletions(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 9b98bef1a849c..fdba7b5da02bb 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -11,6 +11,7 @@ option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" OFF) option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) +option(SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT "Builds SYCL runtime libraries with the support for Command Group Handler bypass kernel submit" OFF) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 5e2a6ec78feb4..25c7cdcbf2e6e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -111,6 +111,7 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, Props, detail::type_erased_cgfo_ty{CGF}, nullptr, CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event_impl(const queue &Q, PropertiesT Props, @@ -120,6 +121,17 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); } + +template +void submit_without_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + //Q.submit_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + // KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail template @@ -165,7 +177,18 @@ event submit_with_event(const queue &Q, PropertiesT Props, return sycl::ext::oneapi::experimental::detail::submit_with_event_impl (Q, Props, Range, KernelFunc, CodeLoc); } -#endif + +template +void submit_without_event(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_without_event_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void single_task(handler &CGH, const KernelType &KernelObj) { diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b78a1e6770eca..a3f698c553e99 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,6 +149,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -165,35 +166,48 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; - std::string_view &KernelName() { return MKernelName; } + detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } + const detail::ABINeutralKernelNameStrT &KernelName() const { return MKernelName; } - std::unique_ptr &HostKernel() { return MHostKernel; } + std::shared_ptr &HostKernel() { return MHostKernel; } + const std::shared_ptr &HostKernel() const { return MHostKernel; } - detail::HostKernelBase *GetHostKernelPtr() { return MHostKernel.get(); } + char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } + char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } int &KernelNumArgs() { return MKernelNumArgs; } + const int &KernelNumArgs() const { return MKernelNumArgs; } KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + const KernelParamDescGetterFuncPtr &KernelParamDescGetter() const { + return MKernelParamDescGetter; + } bool &KernelIsESIMD() { return MKernelIsESIMD; } + const bool &KernelIsESIMD() const { return MKernelIsESIMD; } bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + const bool &KernelHasSpecialCaptures() const { return MKernelHasSpecialCaptures; } detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr() const { + return MKernelNameBasedCachePtr; + } private: - std::string_view MKernelName; - std::unique_ptr MHostKernel; + detail::ABINeutralKernelNameStrT MKernelName; + std::shared_ptr MHostKernel; int MKernelNumArgs = 0; KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; bool MKernelIsESIMD = false; bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace v1 } // namespace detail @@ -214,12 +228,21 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + +template +void submit_without_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3268,17 +3291,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_with_event(sycl::ext::oneapi::experimental::empty_properties_t{}, - Range, Rest..., CodeLoc); -#else return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); -#endif } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3655,6 +3672,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template friend event ext::oneapi::experimental::detail::submit_with_event_impl( @@ -3662,6 +3680,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + template + friend void ext::oneapi::experimental::detail::submit_without_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3682,13 +3708,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void ProcessKernelRuntimeInfo( PropertiesT Props, const KernelType &KernelFunc, detail::v1::KernelRuntimeInfo &KRInfo) const { - using NameT = - typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, @@ -3697,13 +3723,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.HostKernel().reset( new detail::HostKernel( KernelFunc)); - KRInfo.KernelName() = detail::getKernelName(); - KRInfo.KernelNumArgs() = detail::getKernelNumParams(); - KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); - KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); - KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + KRInfo.KernelName() = detail::getKernelName(); + KRInfo.KernelNumArgs() = detail::getKernelNumParams(); + KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); + KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); + KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. @@ -3776,24 +3803,44 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event submit_with_event_impl( nd_range<1> Range, - const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( nd_range<2> Range, - const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( nd_range<3> Range, - const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_without_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + void submit_without_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_without_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3877,7 +3924,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template event submit_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3887,9 +3936,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; + using NameT = + typename detail::get_kernel_name_t::name; + ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(Props, + KernelFunc, KRInfo); + + // TODO call the kernel_parallel_for wrapper // TODO UseFallbackAssert @@ -3897,6 +3951,33 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } + template + void submit_without_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::SubmissionInfo SI{}; + detail::v1::KernelRuntimeInfo KRInfo{}; + + using NameT = + typename detail::get_kernel_name_t::name; + + ProcessSubmitProperties(Props, SI); + ProcessKernelRuntimeInfo(Props, + KernelFunc, KRInfo); + + // TODO call the kernel_parallel_for wrapper + + // TODO UseFallbackAssert + + submit_without_event_impl(Range, SI, KRInfo, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dd1b833383055..0af3cf995b600 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -56,6 +56,14 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME) $<$:__SYCL_BUILD_SYCL_DLL> ) + if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) + target_compile_definitions( + ${LIB_OBJ_NAME} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() + target_include_directories( ${LIB_OBJ_NAME} PRIVATE diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index eaa4955848535..9195c73915e85 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,6 +415,103 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + +std::vector queue_impl::extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams) { + + size_t IndexShift = 0; + std::vector Args; + + Args.reserve(NumKernelParams); + + for (size_t I = 0; I < NumKernelParams; ++I) { + detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); + void *Ptr = LambdaPtr + ParamDesc.offset; + const detail::kernel_param_kind_t &Kind = ParamDesc.kind; + const int &Size = ParamDesc.info; + + Args.emplace_back(Kind, Ptr, Size, I + IndexShift); + } + + return Args; +} + +detail::EventImplPtr +queue_impl::submit_impl(const NDRDescT &NDRDesc, + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + + std::unique_ptr CommandGroup; + detail::CG::StorageInitHelper CGData; + std::vector Args; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + std::unique_lock Lock(MMutex); + + // Graphs are not supported yet for the no-handler path + assert(!hasCommandGraph()); + + // Set the No Last Event Mode to false, since the no-handler path + // does not support it yet. + MNoLastEventMode.store(false, std::memory_order_relaxed); + + // Used by queue_empty() and getLastEvent() + MEmpty.store(false, std::memory_order_release); + + // Sync with an external event + std::optional ExternalEvent = popExternalEvent(); + if (ExternalEvent) { + CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent)); + } + + // Sync with the last event for in order queue + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + if (isInOrder() && LastEvent) { + CGData.MEvents.push_back(LastEvent); + } + + // TODO UnenqueuedCmdEvents for out of order queue + + Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), + KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), + KRInfo.HostKernel(), + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), + std::move(Args), + toKernelNameStrT(KRInfo.KernelName()), + KRInfo.KernelNameBasedCachePtr(), + std::move(StreamStorage), + std::move(AuxiliaryResources), + detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); + + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), *this, CallerNeedsEvent); + + // Sync with the last event for in order queue + if (isInOrder() && !EventImpl->isDiscarded()) { + LastEvent = EventImpl; + } + + return CallerNeedsEvent ? EventImpl : nullptr; +} + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e9a6edbe0a04a..cb3f3e938d8d8 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,17 +370,16 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event submit_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)Range; - (void)SubmitInfo; - (void)KRInfo; - (void)CodeLoc; - (void)IsTopCodeLoc; - return event(); + detail::EventImplPtr EventImpl = + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); } event @@ -388,12 +387,10 @@ class queue_impl : public std::enable_shared_from_this { const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)Range; - (void)SubmitInfo; - (void)KRInfo; - (void)CodeLoc; - (void)IsTopCodeLoc; - return event(); + detail::EventImplPtr EventImpl = + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); } event @@ -401,14 +398,40 @@ class queue_impl : public std::enable_shared_from_this { const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)Range; - (void)SubmitInfo; - (void)KRInfo; - (void)CodeLoc; - (void)IsTopCodeLoc; - return event(); + detail::EventImplPtr EventImpl = + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); } + void + submit_without_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } + + void + submit_without_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } + + void + submit_without_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, @@ -944,6 +967,10 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const SubmissionInfo &SubmitInfo); #endif + std::vector extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams); + /// Performs command group submission to the queue. /// /// \param CGF is a function object containing command group. @@ -960,6 +987,13 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); + detail::EventImplPtr submit_impl(const NDRDescT &NDRDesc, + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 422148c0ec8f9..e854579b74cb2 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,6 +312,7 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event queue::submit_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -339,6 +340,35 @@ event queue::submit_with_event_impl( CodeLoc, IsTopCodeLoc); } +void queue::submit_without_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_without_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_without_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, From a6e711e8ab4dca986a61a52966f55b93adcdc347 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 14 Jul 2025 13:01:14 +0000 Subject: [PATCH 07/65] Added a new configure option to build no handler submit path, changed the no handler private submit function names to submit "direct", addressed review comments. --- .../oneapi/experimental/enqueue_functions.hpp | 39 ++++++----- sycl/include/sycl/queue.hpp | 70 ++++++++++++++----- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 29 ++++---- sycl/source/queue.cpp | 24 +++---- 5 files changed, 102 insertions(+), 62 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 25c7cdcbf2e6e..c20112fde560f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -114,22 +114,21 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -event submit_with_event_impl(const queue &Q, PropertiesT Props, +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + Q.submit_direct_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); } - template -void submit_without_event_impl(const queue &Q, PropertiesT Props, +event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - //Q.submit_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, - // KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); + return Q.submit_direct_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail @@ -149,11 +148,24 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void submit(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_direct_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { + sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_with_event_impl( Q, Props, std::forward(CGF), CodeLoc); } @@ -174,18 +186,7 @@ event submit_with_event(const queue &Q, PropertiesT Props, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_with_event_impl - (Q, Props, Range, KernelFunc, CodeLoc); -} - -template -void submit_without_event(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_without_event_impl + return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl (Q, Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a3f698c553e99..a7ed6aed9e6d7 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -231,14 +231,14 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -event submit_with_event_impl(const queue &Q, PropertiesT Props, +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template -void submit_without_event_impl(const queue &Q, PropertiesT Props, +event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -3675,17 +3675,18 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template - friend event ext::oneapi::experimental::detail::submit_with_event_impl( + friend void ext::oneapi::experimental::detail::submit_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template - friend void ext::oneapi::experimental::detail::submit_without_event_impl( + friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template @@ -3804,37 +3805,37 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - event submit_with_event_impl( + event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl( + event submit_direct_with_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl( + event submit_direct_with_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_without_event_impl( + void submit_direct_without_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_without_event_impl( + void submit_direct_without_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_without_event_impl( + void submit_direct_without_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -3925,10 +3926,45 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + +#ifdef SYCL_LANGUAGE_VERSION +#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS +#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel_entry_point(KernelName)]] +#else +#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]] +#endif // __INTEL_SYCL_USE_INTEGRATION_HEADERS +#else +#define __SYCL_KERNEL_ATTR__ +#endif // SYCL_LANGUAGE_VERSION + + // TODO The kernel wrapper functions have to be moved from the handler class + // to a place where they are accessible by both the handler and the queue class. + // For test purposes, this wrapper is a copy of the handler wrapper, + // but an aplication which would use both handler and no-handler APIs, + // won't compile. + // + // NOTE: the name of these functions - "kernel_parallel_for" - are used by the + // Front End to determine kernel invocation kind. + template +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + ext::oneapi::experimental::detail::PropertyMetaInfo::name..., + ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] +#endif + __SYCL_KERNEL_ATTR__ static void + kernel_parallel_for(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(detail::Builder::getElement(detail::declptr())); +#else + (void)KernelFunc; +#endif + } + template - event submit_with_event(PropertiesT Props, nd_range Range, + event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { @@ -3943,18 +3979,19 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - // TODO call the kernel_parallel_for wrapper + kernel_parallel_for, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); // TODO UseFallbackAssert - return submit_with_event_impl(Range, SI, KRInfo, + return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } template - void submit_without_event(PropertiesT Props, nd_range Range, + void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { @@ -3969,11 +4006,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - // TODO call the kernel_parallel_for wrapper + kernel_parallel_for, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); // TODO UseFallbackAssert - submit_without_event_impl(Range, SI, KRInfo, + submit_direct_without_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 7d5db8cc22254..50ad60689a45e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -439,7 +439,7 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr -queue_impl::submit_impl(const NDRDescT &NDRDesc, +queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 7c4da21778a4a..269ac27d8843b 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -372,62 +372,62 @@ class queue_impl : public std::enable_shared_from_this { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event - submit_with_event(nd_range<1> Range, + submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event - submit_with_event(nd_range<2> Range, + submit_direct_with_event(nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event - submit_with_event(nd_range<3> Range, + submit_direct_with_event(nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } void - submit_without_event(nd_range<1> Range, + submit_direct_without_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } void - submit_without_event(nd_range<2> Range, + submit_direct_without_event(nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } void - submit_without_event(nd_range<3> Range, + submit_direct_without_event(nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -987,13 +987,14 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); - detail::EventImplPtr submit_impl(const NDRDescT &NDRDesc, +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - +#endif /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index e854579b74cb2..be599169b0d98 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -313,57 +313,57 @@ event queue::submit_with_event_impl( } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -event queue::submit_with_event_impl( +event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, SubmitInfo, KRInfo, + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_with_event_impl( +event queue::submit_direct_with_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, SubmitInfo, KRInfo, + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_with_event_impl( +event queue::submit_direct_with_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, SubmitInfo, KRInfo, + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_without_event_impl( +void queue::submit_direct_without_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_without_event(Range, SubmitInfo, KRInfo, + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_without_event_impl( +void queue::submit_direct_without_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_without_event(Range, SubmitInfo, KRInfo, + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_without_event_impl( +void queue::submit_direct_without_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_without_event(Range, SubmitInfo, KRInfo, + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } From 9c8040ec47959c35ee4fabb928e349ed69748380 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 17 Jul 2025 12:35:04 +0000 Subject: [PATCH 08/65] Host task dependency test --- sycl/cmake/modules/AddSYCLUnitTest.cmake | 10 +++- .../oneapi/experimental/enqueue_functions.hpp | 56 +++++++++++++++---- .../sycl/khr/free_function_commands.hpp | 15 +++++ sycl/include/sycl/queue.hpp | 55 ++++++++++++------ .../scheduler/InOrderQueueHostTaskDeps.cpp | 44 ++++++++++++++- 5 files changed, 152 insertions(+), 28 deletions(-) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 3fdc6fde119bc..39a4798696429 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -115,6 +115,14 @@ macro(add_sycl_unittest test_dirname link_variant) -Wno-inconsistent-missing-override ) endif() - + target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) + + if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) + target_compile_definitions( + ${test_dirname} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() endmacro() diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index c20112fde560f..123e883ccd10f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -112,23 +112,23 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, - KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); + Q.submit_direct_without_event(Props, Range, KernelFunc, CodeLoc); } -template event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, - KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); + return Q.submit_direct_with_event(Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail @@ -149,7 +149,7 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template void submit(const queue &Q, PropertiesT Props, nd_range Range, @@ -157,7 +157,7 @@ void submit(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { sycl::ext::oneapi::experimental::detail::submit_direct_impl - (Q, Props, Range, KernelFunc, CodeLoc); + (Q, Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -179,7 +179,7 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, @@ -187,7 +187,7 @@ event submit_with_event(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl - (Q, Props, Range, KernelFunc, CodeLoc); + (Q, Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -303,6 +303,21 @@ void nd_launch(handler &CGH, nd_range Range, KernelObj); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, + ReductionsT &&...Reductions) { + if constexpr (sizeof...(ReductionsT) == 0) { + submit(std::move(Q), empty_properties_t{}, Range, KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Range, KernelObj, + std::forward(Reductions)...); + }); + } +} +#else template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, @@ -312,6 +327,7 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, std::forward(Reductions)...); }); } +#endif template @@ -327,6 +343,25 @@ void nd_launch(handler &CGH, std::forward(Reductions)..., KernelObj); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void nd_launch(queue Q, launch_config, Properties> Config, + const KernelType &KernelObj, ReductionsT &&...Reductions) { + if constexpr (sizeof...(ReductionsT) == 0) { + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + ConfigAccess(Config); + submit(std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), + KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } +} +#else template void nd_launch(queue Q, launch_config, Properties> Config, @@ -336,6 +371,7 @@ void nd_launch(queue Q, launch_config, Properties> Config, std::forward(Reductions)...); }); } +#endif template void nd_launch(handler &CGH, nd_range Range, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4138edd5821e6..e45bcdeca7d4e 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -153,27 +153,42 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<2> r, range<2> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<3> r, range<3> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a7ed6aed9e6d7..cb0227e721418 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -229,14 +229,14 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -template event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, @@ -3279,6 +3279,31 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param Range specifies the global and local work spaces of the kernel + /// \param Rest acts as-if: "ReductionTypes&&... Reductions, + /// const KernelType &KernelFunc". + template + std::enable_if_t::value, event> + parallel_for(nd_range Range, RestT &&...Rest) { + constexpr detail::code_location CodeLoc = getCodeLocation(); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + if constexpr (sizeof...(RestT) == 1) { + return submit_direct_with_event(ext::oneapi::experimental::empty_properties_t{}, + Range, Rest...); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); + } + } +#else /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3297,7 +3322,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); } - +#endif /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3673,14 +3698,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template friend void ext::oneapi::experimental::detail::submit_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); - template friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, @@ -3710,7 +3735,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }; #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template void ProcessKernelRuntimeInfo( PropertiesT Props, const KernelType &KernelFunc, @@ -3961,9 +3986,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #endif } - template + template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = @@ -3976,11 +4000,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, + ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); kernel_parallel_for, KernelType, - ext::oneapi::experimental::empty_properties_t>(KernelFunc); + PropertiesT>(KernelFunc); // TODO UseFallbackAssert @@ -3988,9 +4012,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } - template + template void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = @@ -4003,11 +4026,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, + ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); kernel_parallel_for, KernelType, - ext::oneapi::experimental::empty_properties_t>(KernelFunc); + PropertiesT>(KernelFunc); // TODO UseFallbackAssert diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 31a2914e2c803..b10ec4173e364 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -48,7 +48,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { EXPECT_EQ(GEventsWaitCounter, expectedCount); } -enum class CommandType { KERNEL = 1, MEMSET = 2 }; +enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 }; std::vector> ExecutedCommands; inline ur_result_t customEnqueueKernelLaunch(void *pParams) { @@ -162,3 +162,45 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } + +TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { + ExecutedCommands.clear(); + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + &customEnqueueKernelLaunch); + + sycl::platform Plt = sycl::platform(); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back( + {CommandType::HOST_TASK, 0}); + }); + }); + + event Ev2 = InOrderQueue.parallel_for(nd_range<1>{range{32}, range{32}}, + [](nd_item<1>) {}); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); +} From 31cbdb9fa3e772b07a60ed3799f33488a4f59cf5 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 18 Jul 2025 09:21:16 +0000 Subject: [PATCH 09/65] Add a check for special captures --- sycl/source/detail/queue_impl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 50ad60689a45e..019103c391362 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -457,6 +457,9 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, // Graphs are not supported yet for the no-handler path assert(!hasCommandGraph()); + // No special captures supported yet for the no-handler path + assert(!KRInfo.KernelHasSpecialCaptures); + // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. MNoLastEventMode.store(false, std::memory_order_relaxed); From 998d592ea3349ac29902c51939c98d72ba2e21ef Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 11 Aug 2025 15:32:42 +0000 Subject: [PATCH 10/65] Switch to the common kernel wrappers, fix the KRInfo function call --- sycl/include/sycl/queue.hpp | 42 +++---------------------------- sycl/source/detail/queue_impl.cpp | 2 +- 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index cb0227e721418..7a94323929633 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3952,40 +3952,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -#ifdef SYCL_LANGUAGE_VERSION -#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS -#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel_entry_point(KernelName)]] -#else -#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]] -#endif // __INTEL_SYCL_USE_INTEGRATION_HEADERS -#else -#define __SYCL_KERNEL_ATTR__ -#endif // SYCL_LANGUAGE_VERSION - - // TODO The kernel wrapper functions have to be moved from the handler class - // to a place where they are accessible by both the handler and the queue class. - // For test purposes, this wrapper is a copy of the handler wrapper, - // but an aplication which would use both handler and no-handler APIs, - // won't compile. - // - // NOTE: the name of these functions - "kernel_parallel_for" - are used by the - // Front End to determine kernel invocation kind. - template -#ifdef __SYCL_DEVICE_ONLY__ - [[__sycl_detail__::add_ir_attributes_function( - ext::oneapi::experimental::detail::PropertyMetaInfo::name..., - ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] -#endif - __SYCL_KERNEL_ATTR__ static void - kernel_parallel_for(const KernelType &KernelFunc) { -#ifdef __SYCL_DEVICE_ONLY__ - KernelFunc(detail::Builder::getElement(detail::declptr())); -#else - (void)KernelFunc; -#endif - } - template event submit_direct_with_event(PropertiesT Props, nd_range Range, @@ -4003,8 +3969,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - kernel_parallel_for, KernelType, - PropertiesT>(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert @@ -4029,8 +3995,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - kernel_parallel_for, KernelType, - PropertiesT>(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c77aadc9bc59d..2330b46130a87 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -458,7 +458,7 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, assert(!hasCommandGraph()); // No special captures supported yet for the no-handler path - assert(!KRInfo.KernelHasSpecialCaptures); + assert(!KRInfo.KernelHasSpecialCaptures()); // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. From 4000c07367ec3ccd21a91c150f5fb4a541e3258a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 12 Aug 2025 09:36:17 +0000 Subject: [PATCH 11/65] Enable no handler in the preview lib build, add no handler unit tests target --- sycl/CMakeLists.txt | 1 - sycl/cmake/modules/AddSYCLUnitTest.cmake | 37 +++++++++++++++--------- sycl/include/sycl/queue.hpp | 8 ++--- sycl/source/CMakeLists.txt | 8 ----- sycl/source/detail/queue_impl.cpp | 4 +-- sycl/source/detail/queue_impl.hpp | 8 ++--- sycl/source/queue.cpp | 4 +-- sycl/unittests/xpti_trace/CMakeLists.txt | 1 + 8 files changed, 36 insertions(+), 35 deletions(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 7a8a60fba2ce8..20e940fce9f91 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -11,7 +11,6 @@ option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" OFF) option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) -option(SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT "Builds SYCL runtime libraries with the support for Command Group Handler bypass kernel submit" OFF) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 8791dec59e0e1..e581dfaa61ba0 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -1,6 +1,6 @@ # Internal function to create SYCL unit tests with code reuse -# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...) -function(add_sycl_unittest_internal test_dirname link_variant is_preview) +# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...) +function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh) # Enable exception handling for these unit tests set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) @@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) # Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its # parent scope. if (${is_preview}) - set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") + if (${is_no_cgh}) + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/PreviewNoCGH") + else() + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") + endif() endif() if ("${link_variant}" MATCHES "SHARED") @@ -62,7 +66,19 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) if (${is_preview}) target_compile_definitions(${test_dirname} PRIVATE __INTEL_PREVIEW_BREAKING_CHANGES) - set(sycl_cache_suffix "_preview") + if (${is_no_cgh}) + set(sycl_cache_suffix "_preview_no_cgh") + else() + set(sycl_cache_suffix "_preview") + endif() + endif() + + if (${is_no_cgh}) + target_compile_definitions( + ${test_dirname} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) endif() if (SYCL_ENABLE_XPTI_TRACING) @@ -151,14 +167,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) endif() target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) - - if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) - target_compile_definitions( - ${test_dirname} - PRIVATE - __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - ) - endif() endfunction() # add_sycl_unittest(test_name_prefix SHARED|OBJECT file1.cpp, file2.cpp ...) @@ -167,6 +175,7 @@ endfunction() # the SYCL preview features enabled. # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) - add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_preview_no_cgh ${link_variant} TRUE TRUE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7a94323929633..1f4a9b3d9df00 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,7 +149,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -207,7 +207,7 @@ class __SYCL_EXPORT KernelRuntimeInfo { bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES } // namespace v1 } // namespace detail @@ -3829,7 +3829,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3865,7 +3865,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index ff8bce4de2b1f..24a471eacb0f6 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -56,14 +56,6 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME) $<$:__SYCL_BUILD_SYCL_DLL> ) - if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) - target_compile_definitions( - ${LIB_OBJ_NAME} - PRIVATE - __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - ) - endif() - target_include_directories( ${LIB_OBJ_NAME} PRIVATE diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2330b46130a87..80de9d7d9dd5f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,7 +415,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector queue_impl::extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), @@ -513,7 +513,7 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, return CallerNeedsEvent ? EventImpl : nullptr; } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES template event queue_impl::submitWithHandler(const std::vector &DepEvents, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 484b9e855b53f..18ed4025e20d2 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,7 +370,7 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -430,7 +430,7 @@ class queue_impl : public std::enable_shared_from_this { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, @@ -988,14 +988,14 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -#endif +#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index be599169b0d98..6817386e6c22d 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,7 +312,7 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -367,7 +367,7 @@ void queue::submit_direct_without_event_impl( CodeLoc, IsTopCodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index 44574d36a9aa4..42c5505d86107 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,3 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_preview_no_cgh PRIVATE xpti xptitest_subscriber) From 01af8bbf312af89a12a20ce5cf481d55c48bc579 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 12 Aug 2025 11:13:32 +0000 Subject: [PATCH 12/65] Unused argument fix and IsTopCodeLoc assignment --- sycl/source/detail/queue_impl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 80de9d7d9dd5f..0ffb66bb74c7b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -445,6 +445,7 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)SubmitInfo; std::unique_ptr CommandGroup; detail::CG::StorageInitHelper CGData; @@ -502,6 +503,8 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, 0, // MKernelWorkGroupMemorySize CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), *this, CallerNeedsEvent); From 4469e59fd5afeca91fa4cefdd39ba271f96fb141 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 13 Aug 2025 09:56:39 +0000 Subject: [PATCH 13/65] Implemented the barrier and un-enqueued commands synchronization for out of order queues. Fixed an #ifdef. --- sycl/include/sycl/queue.hpp | 2 +- sycl/source/detail/queue_impl.cpp | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 1f4a9b3d9df00..3eb592ad77343 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3727,6 +3727,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, @@ -3734,7 +3735,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void ProcessKernelRuntimeInfo( diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 0ffb66bb74c7b..f058f596f7c70 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -480,7 +480,19 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, CGData.MEvents.push_back(LastEvent); } - // TODO UnenqueuedCmdEvents for out of order queue + // Barrier and un-enqueued commands synchronization for out or order queue + if (!isInOrder()) { + MMissedCleanupRequests.unset( + [&](MissedCleanupRequestsType &MissedCleanupRequests) { + for (auto &UpdatedGraph : MissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MissedCleanupRequests.clear(); + }); + + if (MDefaultGraphDeps.LastBarrier && !MDefaultGraphDeps.LastBarrier->isEnqueued()) { + CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier); + } + } Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); @@ -513,6 +525,11 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, LastEvent = EventImpl; } + // Barrier and un-enqueued commands synchronization for out or order queue + if (!isInOrder() && !EventImpl->isEnqueued()) { + MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); + } + return CallerNeedsEvent ? EventImpl : nullptr; } From ac1a5cf44db11577ec751db9a15ebe8d150a1e67 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 13 Aug 2025 10:18:28 +0000 Subject: [PATCH 14/65] Fix formatting --- .../oneapi/experimental/enqueue_functions.hpp | 58 ++++----- .../sycl/khr/free_function_commands.hpp | 6 +- sycl/include/sycl/queue.hpp | 110 +++++++++--------- sycl/source/detail/queue_impl.cpp | 42 +++---- sycl/source/detail/queue_impl.hpp | 95 ++++++++------- sycl/source/queue.cpp | 42 +++---- .../scheduler/InOrderQueueHostTaskDeps.cpp | 7 +- 7 files changed, 172 insertions(+), 188 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 123e883ccd10f..d4a6cac2c560f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -112,23 +112,23 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template -void submit_direct_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event(Props, Range, KernelFunc, CodeLoc); -} -template -event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, - nd_range Range, +template +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event(Props, Range, KernelFunc, CodeLoc); + Q.submit_direct_without_event( + Props, Range, KernelFunc, CodeLoc); +} +template +event submit_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { + return Q.submit_direct_with_event( + Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail @@ -151,13 +151,13 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -void submit(const queue &Q, PropertiesT Props, - nd_range Range, +void submit(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_direct_impl - (Q, Props, Range, KernelFunc, CodeLoc); + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_direct_impl< + KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -181,13 +181,13 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -event submit_with_event(const queue &Q, PropertiesT Props, - nd_range Range, +event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl - (Q, Props, Range, KernelFunc, CodeLoc); + sycl::detail::code_location::current()) { + return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl< + KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -350,10 +350,10 @@ void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { if constexpr (sizeof...(ReductionsT) == 0) { ext::oneapi::experimental::detail::LaunchConfigAccess, - Properties> - ConfigAccess(Config); - submit(std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), - KernelObj); + Properties> + ConfigAccess(Config); + submit(std::move(Q), ConfigAccess.getProperties(), + ConfigAccess.getRange(), KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index e45bcdeca7d4e..13ba0c8f56454 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -155,7 +155,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), k); + nd_range<1>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -169,7 +169,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), k); + nd_range<2>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -183,7 +183,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), k); + nd_range<3>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 3eb592ad77343..ffb40895d43bb 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -167,10 +167,14 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } - const detail::ABINeutralKernelNameStrT &KernelName() const { return MKernelName; } + const detail::ABINeutralKernelNameStrT &KernelName() const { + return MKernelName; + } std::shared_ptr &HostKernel() { return MHostKernel; } - const std::shared_ptr &HostKernel() const { return MHostKernel; } + const std::shared_ptr &HostKernel() const { + return MHostKernel; + } char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } @@ -189,7 +193,9 @@ class __SYCL_EXPORT KernelRuntimeInfo { const bool &KernelIsESIMD() const { return MKernelIsESIMD; } bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } - const bool &KernelHasSpecialCaptures() const { return MKernelHasSpecialCaptures; } + const bool &KernelHasSpecialCaptures() const { + return MKernelHasSpecialCaptures; + } detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; @@ -229,19 +235,18 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template -void submit_direct_impl(const queue &Q, PropertiesT Props, - nd_range Range, +template +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -template +template event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3293,8 +3298,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event(ext::oneapi::experimental::empty_properties_t{}, - Range, Rest...); + return submit_direct_with_event( + ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( [&](handler &CGH) { @@ -3698,19 +3703,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template + template friend void ext::oneapi::experimental::detail::submit_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); - template + template friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -3737,9 +3740,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - void ProcessKernelRuntimeInfo( - PropertiesT Props, const KernelType &KernelFunc, - detail::v1::KernelRuntimeInfo &KRInfo) const { + void ProcessKernelRuntimeInfo(PropertiesT Props, const KernelType &KernelFunc, + detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< @@ -3753,8 +3755,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.KernelNumArgs() = detail::getKernelNumParams(); KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); - KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + KRInfo.KernelHasSpecialCaptures() = + detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = + detail::getKernelNameBasedCache(); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -3831,38 +3835,32 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; #endif //__INTEL_PREVIEW_BREAKING_CHANGES @@ -3955,9 +3953,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template event submit_direct_with_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3966,24 +3964,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, - KernelFunc, KRInfo); + ProcessKernelRuntimeInfo( + Props, KernelFunc, KRInfo); - detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert return submit_direct_with_event_impl(Range, SI, KRInfo, - TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } template void submit_direct_without_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3992,16 +3991,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, - KernelFunc, KRInfo); + ProcessKernelRuntimeInfo( + Props, KernelFunc, KRInfo); - detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert submit_direct_without_event_impl(Range, SI, KRInfo, - TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f058f596f7c70..3273be7c2349e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -438,13 +438,10 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( return Args; } -detail::EventImplPtr -queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, - const v1::SubmissionInfo &SubmitInfo, - const v1::KernelRuntimeInfo &KRInfo, - bool CallerNeedsEvent, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { +detail::EventImplPtr queue_impl::submit_direct_impl( + const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)SubmitInfo; std::unique_ptr CommandGroup; @@ -483,36 +480,33 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, // Barrier and un-enqueued commands synchronization for out or order queue if (!isInOrder()) { MMissedCleanupRequests.unset( - [&](MissedCleanupRequestsType &MissedCleanupRequests) { - for (auto &UpdatedGraph : MissedCleanupRequests) - doUnenqueuedCommandCleanup(UpdatedGraph); - MissedCleanupRequests.clear(); - }); + [&](MissedCleanupRequestsType &MissedCleanupRequests) { + for (auto &UpdatedGraph : MissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MissedCleanupRequests.clear(); + }); - if (MDefaultGraphDeps.LastBarrier && !MDefaultGraphDeps.LastBarrier->isEnqueued()) { + if (MDefaultGraphDeps.LastBarrier && + !MDefaultGraphDeps.LastBarrier->isEnqueued()) { CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier); } } Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), - KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); + KRInfo.KernelParamDescGetter(), + KRInfo.KernelNumArgs()); CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), - KRInfo.HostKernel(), + std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), - std::move(Args), - toKernelNameStrT(KRInfo.KernelName()), - KRInfo.KernelNameBasedCachePtr(), - std::move(StreamStorage), - std::move(AuxiliaryResources), - detail::CGType::Kernel, + std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), + KRInfo.KernelNameBasedCachePtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative false, // MKernelUsesClusterLaunch - 0, // MKernelWorkGroupMemorySize + 0, // MKernelWorkGroupMemorySize CodeLoc)); CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 18ed4025e20d2..a8bba21cefc76 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -371,64 +371,61 @@ class queue_impl : public std::enable_shared_from_this { } #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - event - submit_direct_with_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, - IsTopCodeLoc); + event submit_direct_with_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_direct_impl( + NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event - submit_direct_with_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, - IsTopCodeLoc); + event submit_direct_with_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_direct_impl( + NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event - submit_direct_with_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, - IsTopCodeLoc); + event submit_direct_with_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_direct_impl( + NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - void - submit_direct_without_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + void submit_direct_without_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void - submit_direct_without_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + void submit_direct_without_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void - submit_direct_without_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + void submit_direct_without_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } #endif //__INTEL_PREVIEW_BREAKING_CHANGES @@ -969,8 +966,8 @@ class queue_impl : public std::enable_shared_from_this { #endif std::vector extractArgsAndReqsFromLambda( - char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), - size_t NumKernelParams); + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams); /// Performs command group submission to the queue. /// @@ -990,11 +987,11 @@ class queue_impl : public std::enable_shared_from_this { #ifdef __INTEL_PREVIEW_BREAKING_CHANGES detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, - const v1::SubmissionInfo &SubmitInfo, - const v1::KernelRuntimeInfo &KRInfo, - bool CallerNeedsEvent, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc); + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); #endif //__INTEL_PREVIEW_BREAKING_CHANGES /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 6817386e6c22d..e3de12950162c 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -314,57 +314,51 @@ event queue::submit_with_event_impl( #ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } event queue::submit_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } event queue::submit_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } void queue::submit_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } void queue::submit_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } void queue::submit_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } #endif //__INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index b10ec4173e364..7dae968013948 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -182,13 +182,12 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { CGH.host_task([&] { std::unique_lock lk(CvMutex); Cv.wait(lk, [&ready] { return ready; }); - ExecutedCommands.push_back( - {CommandType::HOST_TASK, 0}); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0}); }); }); - event Ev2 = InOrderQueue.parallel_for(nd_range<1>{range{32}, range{32}}, - [](nd_item<1>) {}); + event Ev2 = InOrderQueue.parallel_for( + nd_range<1>{range{32}, range{32}}, [](nd_item<1>) {}); { std::unique_lock lk(CvMutex); From 5865f3a1ae7c55ba26ec2543185eccb2f142005b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 13 Aug 2025 10:33:07 +0000 Subject: [PATCH 15/65] Fixed #ifdef, added comment to a new function. --- sycl/source/detail/queue_impl.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index a8bba21cefc76..ce4b4f48a2db4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -965,9 +965,11 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const SubmissionInfo &SubmitInfo); #endif +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams); +#endif /// Performs command group submission to the queue. /// @@ -986,6 +988,18 @@ class queue_impl : public std::enable_shared_from_this { const v1::SubmissionInfo &SubmitInfo); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES + /// Performs kernel submission to the queue. + /// + /// \param NDRDesc is an NDRange descriptor + /// \param SubmitInfo is additional optional information for the submission. + /// \param KRInfo is a descriptor of the kernel + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. + /// \param CodeLoc is the code location of the submit call + /// \param IsTopCodeLoc Used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, From 27b3110ae28a2fda7f6d080fb700c36de5f27178 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 20 Aug 2025 15:13:49 +0000 Subject: [PATCH 16/65] Address review comments --- sycl/cmake/modules/AddSYCLUnitTest.cmake | 5 +- .../oneapi/experimental/enqueue_functions.hpp | 22 +++------ sycl/include/sycl/queue.hpp | 46 ++++++------------- sycl/source/detail/queue_impl.cpp | 4 +- sycl/source/detail/queue_impl.hpp | 8 ++-- sycl/source/queue.cpp | 4 +- sycl/unittests/xpti_trace/CMakeLists.txt | 2 +- 7 files changed, 32 insertions(+), 59 deletions(-) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index e581dfaa61ba0..f7102b393c671 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -67,7 +67,7 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c target_compile_definitions(${test_dirname} PRIVATE __INTEL_PREVIEW_BREAKING_CHANGES) if (${is_no_cgh}) - set(sycl_cache_suffix "_preview_no_cgh") + set(sycl_cache_suffix "_non_preview_no_cgh") else() set(sycl_cache_suffix "_preview") endif() @@ -165,7 +165,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c -Wno-inconsistent-missing-override ) endif() - target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) endfunction() @@ -176,6 +175,6 @@ endfunction() # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_non_preview_no_cgh ${link_variant} FALSE TRUE ${ARGN}) add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_preview_no_cgh ${link_variant} TRUE TRUE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 95330db434e89..3ddb983393826 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -118,7 +118,7 @@ template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event( Props, Range, KernelFunc, CodeLoc); } @@ -127,7 +127,7 @@ template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event( Props, Range, KernelFunc, CodeLoc); } @@ -304,11 +304,11 @@ void nd_launch(handler &CGH, nd_range Range, KernelObj); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(ReductionsT) == 0) { submit(std::move(Q), empty_properties_t{}, Range, KernelObj); } else { @@ -317,18 +317,13 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, std::forward(Reductions)...); }); } -} #else -template -void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, - ReductionsT &&...Reductions) { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); }); -} #endif +} template @@ -344,11 +339,11 @@ void nd_launch(handler &CGH, std::forward(Reductions)..., KernelObj); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(ReductionsT) == 0) { ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> @@ -361,18 +356,13 @@ void nd_launch(queue Q, launch_config, Properties> Config, std::forward(Reductions)...); }); } -} #else -template -void nd_launch(queue Q, launch_config, Properties> Config, - const KernelType &KernelObj, ReductionsT &&...Reductions) { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, std::forward(Reductions)...); }); -} #endif +} template void nd_launch(handler &CGH, nd_range Range, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index bd67e01834180..b1d1ce453909c 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,7 +151,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -215,7 +215,7 @@ class __SYCL_EXPORT KernelRuntimeInfo { bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES } // namespace v1 } // namespace detail @@ -3283,7 +3283,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3296,8 +3295,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event( + return submit_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3306,27 +3306,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); } - } #else - /// parallel_for version with a kernel represented as a lambda + nd_range that - /// specifies global, local sizes and offset. - /// - /// \param Range specifies the global and local work spaces of the kernel - /// \param Rest acts as-if: "ReductionTypes&&... Reductions, - /// const KernelType &KernelFunc". - template - std::enable_if_t::value, event> - parallel_for(nd_range Range, RestT &&...Rest) { - constexpr detail::code_location CodeLoc = getCodeLocation(); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); - } + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); #endif + } + /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3832,7 +3820,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -3862,7 +3850,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, @@ -3908,7 +3896,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3928,14 +3916,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - // TODO UseFallbackAssert - return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } - template void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3955,8 +3941,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - // TODO UseFallbackAssert - submit_direct_without_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 222ca53910003..d8fbf85137e2e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -403,7 +403,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector queue_impl::extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), @@ -515,7 +515,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( return CallerNeedsEvent ? EventImpl : nullptr; } -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES template event queue_impl::submitWithHandler(const std::vector &DepEvents, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 59024d44dd633..738a78d6213ce 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -365,7 +365,7 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -422,7 +422,7 @@ class queue_impl : public std::enable_shared_from_this { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, @@ -952,7 +952,7 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams); @@ -975,7 +975,7 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index bd25d703aa51d..94df8a91d7a48 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -315,7 +315,7 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -364,7 +364,7 @@ void queue::submit_direct_without_event_impl( IsTopCodeLoc); } -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index 42c5505d86107..e3553d840e4c6 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,4 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) -target_link_libraries(XptiTraceTests_preview_no_cgh PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_non_preview_no_cgh PRIVATE xpti xptitest_subscriber) From 9041e94a878e8ff5ccc50baf76d1edf12c16219b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 08:25:13 +0000 Subject: [PATCH 17/65] Updated Linux symbols --- sycl/include/sycl/queue.hpp | 4 ---- sycl/source/detail/queue_impl.cpp | 4 ---- sycl/source/detail/queue_impl.hpp | 5 +---- sycl/source/queue.cpp | 3 --- sycl/test/abi/sycl_symbols_linux.dump | 8 +++++++- 5 files changed, 8 insertions(+), 16 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b1d1ce453909c..2d727b7496468 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,7 +151,6 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -215,7 +214,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES } // namespace v1 } // namespace detail @@ -3820,7 +3818,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -3850,7 +3847,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index d8fbf85137e2e..99d64d118b6a5 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -403,8 +403,6 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - std::vector queue_impl::extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams) { @@ -515,8 +513,6 @@ detail::EventImplPtr queue_impl::submit_direct_impl( return CallerNeedsEvent ? EventImpl : nullptr; } -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES - template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 738a78d6213ce..f032e709da0a0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -365,7 +365,6 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -422,7 +421,6 @@ class queue_impl : public std::enable_shared_from_this { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, @@ -952,7 +950,6 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams); @@ -975,7 +972,7 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 94df8a91d7a48..ec2c5602816f9 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -315,7 +315,6 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -364,8 +363,6 @@ void queue::submit_direct_without_event_impl( IsTopCodeLoc); } -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES - void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index f41c07ee394b7..76d849af87c61 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3334,6 +3334,7 @@ _ZN4sycl3_V16detail21LocalAccessorBaseHost7getSizeEv _ZN4sycl3_V16detail21LocalAccessorBaseHostC1ENS0_5rangeILi3EEEiiRKNS0_13property_listE _ZN4sycl3_V16detail21LocalAccessorBaseHostC2ENS0_5rangeILi3EEEiiRKNS0_13property_listE _ZN4sycl3_V16detail22addHostAccessorAndWaitEPNS1_16AccessorImplHostE +_ZN4sycl3_V16detail22free_function_info_map3addEPKPKcPKjj _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateERKSt8functionIFbRKSt10shared_ptrINS1_17device_image_implEEEE _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKNS0_4spanIcLm18446744073709551615EEENS0_12bundle_stateE @@ -3747,7 +3748,13 @@ _ZNK4sycl3_V15queue20ext_oneapi_get_stateEv _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv +_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv +_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv @@ -3850,7 +3857,6 @@ _ZNK4sycl3_V16detail19kernel_bundle_plain5emptyEv _ZNK4sycl3_V16detail21LocalAccessorBaseHost11getPropListEv _ZNK4sycl3_V16detail21LocalAccessorBaseHost6getPtrEv _ZNK4sycl3_V16detail21LocalAccessorBaseHost7getSizeEv -_ZN4sycl3_V16detail22free_function_info_map3addEPKPKcPKjj _ZNK4sycl3_V16detail28SampledImageAccessorBaseHost10getSamplerEv _ZNK4sycl3_V16detail28SampledImageAccessorBaseHost11getPropListEv _ZNK4sycl3_V16detail28SampledImageAccessorBaseHost12getNumOfDimsEv From ac2c5bb05206f0dc3b50a8b63c4b0466aaa6ca80 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 11:23:04 +0000 Subject: [PATCH 18/65] Addressed more review comments --- .../ext/oneapi/experimental/enqueue_functions.hpp | 12 ++---------- sycl/include/sycl/khr/free_function_commands.hpp | 6 +++--- sycl/include/sycl/queue.hpp | 10 ---------- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 3ddb983393826..fa3d51da5ec1c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -112,14 +112,12 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event( + Q.submit_direct_without_event( Props, Range, KernelFunc, CodeLoc); } template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event( + return Q.submit_direct_with_event( Props, Range, KernelFunc, CodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail template @@ -149,7 +145,6 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void submit(const queue &Q, PropertiesT Props, nd_range Range, @@ -160,7 +155,6 @@ void submit(const queue &Q, PropertiesT Props, nd_range Range, KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event(const queue &Q, PropertiesT Props, @@ -179,7 +173,6 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, @@ -190,7 +183,6 @@ event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void single_task(handler &CGH, const KernelType &KernelObj) { diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 13ba0c8f56454..4dd7c067126a1 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -154,7 +154,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + submit(q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), k); #else submit( @@ -168,7 +168,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + submit(q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), k); #else submit( @@ -182,7 +182,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + submit(q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), k); #else submit( diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 2d727b7496468..7d9716a6d9d4a 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -234,7 +234,6 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, @@ -247,7 +246,6 @@ event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3687,7 +3685,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template friend void ext::oneapi::experimental::detail::submit_direct_impl( @@ -3700,8 +3697,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3715,7 +3710,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, @@ -3745,7 +3739,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. @@ -3890,8 +3883,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template event submit_direct_with_event(PropertiesT Props, nd_range Range, @@ -3941,7 +3932,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. From 8e155fb23a40d70c54ac066f1c64fe67f2cae68e Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 11:24:40 +0000 Subject: [PATCH 19/65] Fix formatting --- sycl/include/sycl/queue.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7d9716a6d9d4a..53d7c6a24f0e6 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3303,11 +3303,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } #else - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); #endif } @@ -3883,8 +3883,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template + template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = @@ -3908,8 +3908,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template + template void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = From 502f63733a684972dc8444ccd259400bb57e7521 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 12:20:44 +0000 Subject: [PATCH 20/65] Fix formatting, remove unused properties argument --- .../sycl/ext/oneapi/experimental/enqueue_functions.hpp | 2 +- sycl/include/sycl/queue.hpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index fa3d51da5ec1c..4b47c3da87a41 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -160,7 +160,7 @@ template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { + sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_with_event_impl( Q, Props, std::forward(CGF), CodeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 53d7c6a24f0e6..171a05549a70c 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3717,9 +3717,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; - template - void ProcessKernelRuntimeInfo(PropertiesT Props, const KernelType &KernelFunc, + template + void ProcessKernelRuntimeInfo(const KernelType &KernelFunc, detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; From d708c9353355e6b11ac3f904585d75bdedec16b2 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 13:18:49 +0000 Subject: [PATCH 21/65] Fix ProcessKernelRuntimeInfo call --- sycl/include/sycl/queue.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 171a05549a70c..36101e2380362 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3896,8 +3896,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo( - Props, KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(Props, KernelFunc, + KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); @@ -3921,8 +3921,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo( - Props, KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(Props, KernelFunc, + KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); From e9f6e4e347158d621911c2dea663ec54837b30c6 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 13:51:55 +0000 Subject: [PATCH 22/65] Fix unit test build and ProcessKernelRuntimeInfo calls --- sycl/cmake/modules/AddSYCLUnitTest.cmake | 20 ++++++++++---------- sycl/include/sycl/queue.hpp | 6 ++---- sycl/unittests/xpti_trace/CMakeLists.txt | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index f7102b393c671..aeede07acdead 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -34,11 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c # Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its # parent scope. if (${is_preview}) - if (${is_no_cgh}) - set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/PreviewNoCGH") - else() set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") - endif() + endif() + + if (${is_no_cgh}) + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH") endif() if ("${link_variant}" MATCHES "SHARED") @@ -66,11 +66,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c if (${is_preview}) target_compile_definitions(${test_dirname} PRIVATE __INTEL_PREVIEW_BREAKING_CHANGES) - if (${is_no_cgh}) - set(sycl_cache_suffix "_non_preview_no_cgh") - else() - set(sycl_cache_suffix "_preview") - endif() + set(sycl_cache_suffix "_preview") + endif() + + if (${is_no_cgh}) + set(sycl_cache_suffix "_no_cgh") endif() if (${is_no_cgh}) @@ -175,6 +175,6 @@ endfunction() # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_non_preview_no_cgh ${link_variant} FALSE TRUE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN}) add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 36101e2380362..c04d28af0631e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3896,8 +3896,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, KernelFunc, - KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); @@ -3921,8 +3920,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, KernelFunc, - KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index e3553d840e4c6..0fbb3be4046fc 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,4 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) -target_link_libraries(XptiTraceTests_non_preview_no_cgh PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_no_cgh PRIVATE xpti xptitest_subscriber) From 057a7a58bea78f8ff156e8ff9530ccfdf4959bcb Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 16:18:02 +0000 Subject: [PATCH 23/65] Fix formatting --- sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 4b47c3da87a41..300cb003978de 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -120,6 +120,7 @@ void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, Q.submit_direct_without_event( Props, Range, KernelFunc, CodeLoc); } + template event submit_direct_with_event_impl( From 77d92ca0a18d05d1820ee9788544b667b9a02db6 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 22 Aug 2025 14:55:57 +0000 Subject: [PATCH 24/65] Added single_task shortcut function support for no-handler --- sycl/include/sycl/queue.hpp | 37 +++++++++++++------ .../Extensions/CommandGraph/CommandGraph.cpp | 3 ++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index c04d28af0631e..8ebf40b56bfce 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2738,12 +2738,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + return submit_direct_with_event( + ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, + KernelFunc); + +#else return submit( [&](handler &CGH) { CGH.template single_task( Properties, KernelFunc); }, TlsCodeLocCapture.query()); +#endif + } /// single_task version with a kernel represented as a lambda. @@ -3293,7 +3302,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event( + return submit_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3717,18 +3727,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; - template + template void ProcessKernelRuntimeInfo(const KernelType &KernelFunc, detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; - using TransformedArgType = std::conditional_t< + using TransformedArgType = std::conditional_t::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename TransformUserItemType::type>, + void>; KRInfo.HostKernel().reset( - new detail::HostKernel( - KernelFunc)); + new detail::HostKernel( + KernelFunc)); + KRInfo.KernelName() = detail::getKernelName(); KRInfo.KernelNumArgs() = detail::getKernelNumParams(); KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); @@ -3882,7 +3896,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3896,10 +3911,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); - detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + detail::KernelWrapper::wrap(KernelFunc); return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), @@ -3920,7 +3935,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp index 82fc0ed56c09d..662eef6186683 100644 --- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp @@ -619,6 +619,8 @@ TEST_F(CommandGraphTest, AccessorModeEdges) { // Tests the transitive queue recording behaviour with queue shortcuts. TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { +// Graphs not supported yet for the no-handler submit path +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT device Dev; context Ctx{{Dev}}; queue Q1{Ctx, Dev}; @@ -662,6 +664,7 @@ TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { ext::oneapi::experimental::queue_state::executing); ASSERT_EQ(Q3.ext_oneapi_get_state(), ext::oneapi::experimental::queue_state::executing); +#endif } // Tests that dynamic_work_group_memory.get() will throw on the host side. From 85aaa5c3fefca6aba36ecb3dcdaf9fc862e486dc Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 25 Aug 2025 07:47:49 +0000 Subject: [PATCH 25/65] Fix formatting --- sycl/include/sycl/queue.hpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 8ebf40b56bfce..b5a28f2f2de6f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2752,7 +2752,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); #endif - } /// single_task version with a kernel represented as a lambda. @@ -3302,8 +3301,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event>( + return submit_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3733,15 +3732,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; - using TransformedArgType = std::conditional_t::value && Dims == 1, item, - typename TransformUserItemType::type>, + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>, void>; KRInfo.HostKernel().reset( - new detail::HostKernel( - KernelFunc)); + new detail::HostKernel( + KernelFunc)); KRInfo.KernelName() = detail::getKernelName(); KRInfo.KernelNumArgs() = detail::getKernelNumParams(); @@ -3911,10 +3911,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, + KRInfo); - detail::KernelWrapper::wrap(KernelFunc); + detail::KernelWrapper::wrap(KernelFunc); return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), @@ -3935,7 +3936,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); From 967d35e7ea1a815f874f2d2cbf0eea2d85947c5f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 08:47:41 +0000 Subject: [PATCH 26/65] Update KernelRuntimeInfo - change kernel name based cache pointer to DeviceKernelInfo pointer --- sycl/include/sycl/queue.hpp | 13 ++++++------- sycl/source/detail/queue_impl.cpp | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b5a28f2f2de6f..96f5902cc2108 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -198,11 +198,11 @@ class __SYCL_EXPORT KernelRuntimeInfo { return MKernelHasSpecialCaptures; } - detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { - return MKernelNameBasedCachePtr; + detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { + return MDeviceKernelInfoPtr; } - detail::KernelNameBasedCacheT *KernelNameBasedCachePtr() const { - return MKernelNameBasedCachePtr; + detail::DeviceKernelInfo *DeviceKernelInfoPtr() const { + return MDeviceKernelInfoPtr; } private: @@ -212,7 +212,7 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; bool MKernelIsESIMD = false; bool MKernelHasSpecialCaptures = true; - detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; + detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; }; } // namespace v1 @@ -3749,8 +3749,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - KRInfo.KernelNameBasedCachePtr() = - detail::getKernelNameBasedCache(); + KRInfo.DeviceKernelInfoPtr() = detail::getDeviceKernelInfo(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index e763ecdfc5ddc..6c2454b6ddea8 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -510,7 +510,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( nullptr, // MKernel nullptr, // MKernelBundle std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), - KRInfo.KernelNameBasedCachePtr(), std::move(StreamStorage), + *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative From ec1ef896502ed3def2cda123be90b1a48933dbba Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 09:36:09 +0000 Subject: [PATCH 27/65] Switch to DeviceKernelInfo use on the no-handler path --- sycl/include/sycl/queue.hpp | 29 +---------------------------- sycl/source/detail/queue_impl.cpp | 11 ++++++----- 2 files changed, 7 insertions(+), 33 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 96f5902cc2108..84ba701878628 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -180,24 +180,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } - int &KernelNumArgs() { return MKernelNumArgs; } - const int &KernelNumArgs() const { return MKernelNumArgs; } - - KernelParamDescGetterFuncPtr &KernelParamDescGetter() { - return MKernelParamDescGetter; - } - const KernelParamDescGetterFuncPtr &KernelParamDescGetter() const { - return MKernelParamDescGetter; - } - - bool &KernelIsESIMD() { return MKernelIsESIMD; } - const bool &KernelIsESIMD() const { return MKernelIsESIMD; } - - bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } - const bool &KernelHasSpecialCaptures() const { - return MKernelHasSpecialCaptures; - } - detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { return MDeviceKernelInfoPtr; } @@ -208,10 +190,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { private: detail::ABINeutralKernelNameStrT MKernelName; std::shared_ptr MHostKernel; - int MKernelNumArgs = 0; - KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; - bool MKernelIsESIMD = false; - bool MKernelHasSpecialCaptures = true; detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; }; @@ -3744,12 +3722,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KernelFunc)); KRInfo.KernelName() = detail::getKernelName(); - KRInfo.KernelNumArgs() = detail::getKernelNumParams(); - KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); - KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); - KRInfo.KernelHasSpecialCaptures() = - detail::hasSpecialCaptures(); - KRInfo.DeviceKernelInfoPtr() = detail::getDeviceKernelInfo(); + KRInfo.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6c2454b6ddea8..521366fcb9ad2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -465,7 +465,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( assert(!hasCommandGraph()); // No special captures supported yet for the no-handler path - assert(!KRInfo.KernelHasSpecialCaptures()); + assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. @@ -501,15 +501,16 @@ detail::EventImplPtr queue_impl::submit_direct_impl( } } - Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), - KRInfo.KernelParamDescGetter(), - KRInfo.KernelNumArgs()); + Args = extractArgsAndReqsFromLambda( + KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); CommandGroup.reset(new detail::CGExecKernel( std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), + std::move(CGData), std::move(Args), + toKernelNameStrT(KRInfo.DeviceKernelInfoPtr()->Name), *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, From 1f95b9b5fd1f0bce2476f20b8678092b07db0d84 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 12:21:44 +0000 Subject: [PATCH 28/65] KernelName fix --- sycl/source/detail/queue_impl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 521366fcb9ad2..54f0dbe2907b9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -509,8 +509,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), std::move(Args), - toKernelNameStrT(KRInfo.DeviceKernelInfoPtr()->Name), + std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, From 12ef6dadc5339381f80e21433ea43c822c7b5ea3 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 15:09:42 +0000 Subject: [PATCH 29/65] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index eae017c88eac8..bf5343a2c0dc7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -298,6 +298,7 @@ ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_K_NAEBVproperty_list@23@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_N_K4AEBVproperty_list@23@@Z ??0HostProfilingInfo@detail@_V1@sycl@@QEAA@XZ +??0KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??0LocalAccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VLocalAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -507,6 +508,7 @@ ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@AEBUcode_location@123@@Z ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@XZ ??1AccessorBaseHost@detail@_V1@sycl@@QEAA@XZ +??1KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??1LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1SYCLCategory@detail@_V1@sycl@@UEAA@XZ ??1SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ @@ -710,6 +712,8 @@ ??_Fqueue@_V1@sycl@@QEAAXXZ ?AccessTargetMask@handler@_V1@sycl@@0HB ?Clear@exception_list@_V1@sycl@@AEAAXXZ +?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAPEAVDeviceKernelInfo@345@XZ +?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEAVDeviceKernelInfo@345@XZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ ?EventMode@SubmissionInfo@detail@_V1@sycl@@QEAAAEAW4event_mode_enum@experimental@oneapi@ext@34@XZ @@ -718,8 +722,14 @@ ?EventMode@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBW4event_mode_enum@experimental@oneapi@ext@45@XZ ?GDBMethodsAnchor@SampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ +?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAPEADXZ +?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEADXZ ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z +?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ +?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ +?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAVstring@345@XZ +?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBVstring@345@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@v1@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@345@XZ @@ -4464,6 +4474,12 @@ ?storeRawArg@handler@_V1@sycl@@AEAAPEAXAEBVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?storeRawArg@handler@_V1@sycl@@AEAAPEAXPEBX_K@Z ?stringifyErrorCode@detail@_V1@sycl@@YAPEBDH@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@@Z From 29805311b105a95c23bbd151f0559ad34e4fc814 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 1 Sep 2025 15:39:45 +0000 Subject: [PATCH 30/65] Split the kernel submit code into a command submission lambda and generic part. Added "kernel" to the direct submission function names. --- .../oneapi/experimental/enqueue_functions.hpp | 12 ++-- sycl/include/sycl/queue.hpp | 48 ++++++------- sycl/source/detail/queue_impl.cpp | 72 ++++++++++--------- sycl/source/detail/queue_impl.hpp | 44 ++++++------ sycl/source/queue.cpp | 36 +++++----- 5 files changed, 107 insertions(+), 105 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 300cb003978de..f59315b6795fc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -114,19 +114,19 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event( + Q.submit_kernel_direct_without_event( Props, Range, KernelFunc, CodeLoc); } template -event submit_direct_with_event_impl( +event submit_kernel_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event( + return Q.submit_kernel_direct_with_event( Props, Range, KernelFunc, CodeLoc); } } // namespace detail @@ -152,7 +152,7 @@ void submit(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_direct_impl< + sycl::ext::oneapi::experimental::detail::submit_kernel_direct_impl< KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } @@ -180,7 +180,7 @@ event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl< + return sycl::ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl< KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 84ba701878628..d273e53058a9f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -214,13 +214,13 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template -event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, +event submit_kernel_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -2718,7 +2718,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_direct_with_event( + return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, KernelFunc); @@ -3279,7 +3279,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { @@ -3674,13 +3674,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - friend void ext::oneapi::experimental::detail::submit_direct_impl( + friend void ext::oneapi::experimental::detail::submit_kernel_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template - friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( + friend event ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -3796,33 +3796,33 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_direct_with_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_direct_with_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_direct_with_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_direct_without_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_direct_without_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_direct_without_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; @@ -3871,50 +3871,46 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - event submit_direct_with_event(PropertiesT Props, nd_range Range, + event submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; using NameT = typename detail::get_kernel_name_t::name; - ProcessSubmitProperties(Props, SI); ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper::wrap(KernelFunc); - return submit_direct_with_event_impl(Range, SI, KRInfo, + return submit_kernel_direct_with_event_impl(Range, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } template - void submit_direct_without_event(PropertiesT Props, nd_range Range, + void submit_kernel_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; using NameT = typename detail::get_kernel_name_t::name; - ProcessSubmitProperties(Props, SI); ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - submit_direct_without_event_impl(Range, SI, KRInfo, + submit_kernel_direct_without_event_impl(Range, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 54f0dbe2907b9..78a31465cfaf3 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -447,26 +447,54 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( return Args; } -detail::EventImplPtr queue_impl::submit_direct_impl( - const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, +detail::EventImplPtr queue_impl::submit_kernel_direct_impl( + const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)SubmitInfo; - std::unique_ptr CommandGroup; - detail::CG::StorageInitHelper CGData; - std::vector Args; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; + // No special captures supported yet for the no-handler path + assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); + + SubmitCommandFuncType SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { + std::unique_ptr CommandGroup; + std::vector Args; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + Args = extractArgsAndReqsFromLambda( + KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), KRInfo.HostKernel(), + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), + *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), *this, CallerNeedsEvent); + return EventImpl; + }; + return submit_generic_direct(CallerNeedsEvent, SubmitKernelFunc); +} + +detail::EventImplPtr queue_impl::submit_generic_direct( + bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc) { + detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); // Graphs are not supported yet for the no-handler path assert(!hasCommandGraph()); - // No special captures supported yet for the no-handler path - assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); - // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. MNoLastEventMode.store(false, std::memory_order_relaxed); @@ -501,27 +529,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( } } - Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, - KRInfo.DeviceKernelInfoPtr()->NumParams); - - CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), KRInfo.HostKernel(), - nullptr, // MKernel - nullptr, // MKernelBundle - std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), - *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // MKernelIsCooperative - false, // MKernelUsesClusterLaunch - 0, // MKernelWorkGroupMemorySize - CodeLoc)); - - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; - - EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), *this, CallerNeedsEvent); + EventImplPtr EventImpl = SubmitCommandFunc(CGData); // Sync with the last event for in order queue if (isInOrder() && !EventImpl->isDiscarded()) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d50ce85a15f81..aa59300009842 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -366,60 +366,54 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - event submit_direct_with_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event(nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_direct_impl( - NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_direct_with_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event(nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_direct_impl( - NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_direct_with_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event(nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_direct_impl( - NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - void submit_direct_without_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event(nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, IsTopCodeLoc); } - void submit_direct_without_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event(nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, IsTopCodeLoc); } - void submit_direct_without_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event(nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, IsTopCodeLoc); } @@ -965,13 +959,17 @@ class queue_impl : public std::enable_shared_from_this { /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, - const v1::SubmissionInfo &SubmitInfo, + detail::EventImplPtr submit_kernel_direct_impl(const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); +using SubmitCommandFuncType = std::function; + + detail::EventImplPtr submit_generic_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index ec2c5602816f9..273731e4c477b 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -315,51 +315,51 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_direct_with_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, +event queue::submit_kernel_direct_with_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_direct_with_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, +event queue::submit_kernel_direct_with_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_direct_with_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, +event queue::submit_kernel_direct_with_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_direct_without_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, +void queue::submit_kernel_direct_without_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_direct_without_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, +void queue::submit_kernel_direct_without_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_direct_without_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, +void queue::submit_kernel_direct_without_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } From 01e0f9f7f5510a66990831c8bad8e4ddedbf3a28 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 2 Sep 2025 10:13:31 +0000 Subject: [PATCH 31/65] Fix formatting --- .../oneapi/experimental/enqueue_functions.hpp | 21 +++--- sycl/include/sycl/queue.hpp | 67 +++++++++---------- sycl/source/detail/queue_impl.cpp | 33 +++++---- sycl/source/detail/queue_impl.hpp | 65 +++++++++--------- sycl/source/queue.cpp | 30 ++++----- 5 files changed, 106 insertions(+), 110 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index f59315b6795fc..096e2d772dade 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -114,11 +114,12 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc) { - Q.submit_kernel_direct_without_event( - Props, Range, KernelFunc, CodeLoc); +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + Q.submit_kernel_direct_without_event(Props, Range, KernelFunc, CodeLoc); } template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_kernel_direct_with_event( - Props, Range, KernelFunc, CodeLoc); + return Q.submit_kernel_direct_with_event(Props, Range, KernelFunc, + CodeLoc); } } // namespace detail @@ -180,8 +182,9 @@ event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl< - KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + return sycl::ext::oneapi::experimental::detail:: + submit_kernel_direct_with_event_impl(Q, Props, Range, KernelFunc, CodeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index d273e53058a9f..8e1803be9bbe7 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -214,16 +214,16 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); template -event submit_kernel_direct_with_event_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); +event submit_kernel_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); } // namespace detail } // namespace ext::oneapi::experimental @@ -2718,7 +2718,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_kernel_direct_with_event( + return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, KernelFunc); @@ -3279,8 +3280,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_kernel_direct_with_event>( + return submit_kernel_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3680,7 +3681,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - friend event ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( + friend event + ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -3797,33 +3799,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. @@ -3871,10 +3867,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - event submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + event + submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3888,16 +3885,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT>::wrap(KernelFunc); return submit_kernel_direct_with_event_impl(Range, KRInfo, - TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } template - void submit_kernel_direct_without_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + void submit_kernel_direct_without_event( + PropertiesT Props, nd_range Range, const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3911,8 +3908,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { sycl::nd_item, PropertiesT>::wrap(KernelFunc); submit_kernel_direct_without_event_impl(Range, KRInfo, - TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } /// Submits a command group function object to the queue, in order to be diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 78a31465cfaf3..f725bded7484c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -448,47 +448,54 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, - const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { // No special captures supported yet for the no-handler path assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); - SubmitCommandFuncType SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { + SubmitCommandFuncType SubmitKernelFunc = + [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { std::unique_ptr CommandGroup; std::vector Args; std::vector> StreamStorage; std::vector> AuxiliaryResources; Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, - KRInfo.DeviceKernelInfoPtr()->NumParams); + KRInfo.GetKernelFuncPtr(), + KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); CommandGroup.reset(new detail::CGExecKernel( std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), - *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, + std::move(CGData), std::move(Args), + toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + std::move(StreamStorage), std::move(AuxiliaryResources), + detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative false, // MKernelUsesClusterLaunch 0, // MKernelWorkGroupMemorySize CodeLoc)); CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + // TODO DiscardEvent should include a check for requirements list + // once accessors are implemented + bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), *this, CallerNeedsEvent); + std::move(CommandGroup), *this, !DiscardEvent); return EventImpl; }; return submit_generic_direct(CallerNeedsEvent, SubmitKernelFunc); } -detail::EventImplPtr queue_impl::submit_generic_direct( - bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc) { +detail::EventImplPtr +queue_impl::submit_generic_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index aa59300009842..7a3bfc55fde6c 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -366,55 +366,49 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - event submit_kernel_direct_with_event(nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + event submit_kernel_direct_with_event( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_kernel_direct_with_event(nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + event submit_kernel_direct_with_event( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_kernel_direct_with_event(nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + event submit_kernel_direct_with_event( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - void submit_kernel_direct_without_event(nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + void submit_kernel_direct_without_event( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void submit_kernel_direct_without_event(nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + void submit_kernel_direct_without_event( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void submit_kernel_direct_without_event(nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + void submit_kernel_direct_without_event( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -959,16 +953,17 @@ class queue_impl : public std::enable_shared_from_this { /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_kernel_direct_impl(const NDRDescT &NDRDesc, - const v1::KernelRuntimeInfo &KRInfo, - bool CallerNeedsEvent, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc); + detail::EventImplPtr submit_kernel_direct_impl( + const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); -using SubmitCommandFuncType = std::function; + using SubmitCommandFuncType = + std::function; - detail::EventImplPtr submit_generic_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc); + detail::EventImplPtr + submit_generic_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 273731e4c477b..26fca0c4ec44c 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -316,51 +316,45 @@ event queue::submit_with_event_impl( } event queue::submit_kernel_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_without_event_impl( From 63d1345842b55b27fcc878c7897193e0a0c83972 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 2 Sep 2025 10:17:49 +0000 Subject: [PATCH 32/65] Rename submit_generic_direct to submit_direct --- sycl/source/detail/queue_impl.cpp | 6 +++--- sycl/source/detail/queue_impl.hpp | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f725bded7484c..6838d43d108b2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -490,12 +490,12 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( return EventImpl; }; - return submit_generic_direct(CallerNeedsEvent, SubmitKernelFunc); + return submit_direct(CallerNeedsEvent, SubmitKernelFunc); } detail::EventImplPtr -queue_impl::submit_generic_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc) { +queue_impl::submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 7a3bfc55fde6c..85656daf63d26 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -961,9 +961,8 @@ class queue_impl : public std::enable_shared_from_this { using SubmitCommandFuncType = std::function; - detail::EventImplPtr - submit_generic_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc); + detail::EventImplPtr submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. From 4001fea5a37360b6a42a28371e6ffedccd99311b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 4 Sep 2025 08:30:30 +0000 Subject: [PATCH 33/65] Fix unused Props argument --- sycl/include/sycl/queue.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 8e1803be9bbe7..5cad1ff21d1b5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3872,6 +3872,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { + (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3895,6 +3896,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { + (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; From 6c9525be8664df75073b603e5b4dae620c803902 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 4 Sep 2025 10:04:30 +0000 Subject: [PATCH 34/65] Update Linux symbols --- sycl/test/abi/sycl_symbols_linux.dump | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index cb5bdfddfc416..28829700907fa 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3750,13 +3750,13 @@ _ZNK4sycl3_V15queue20ext_oneapi_get_stateEv _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv -_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv -_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv From 18df56b9044547eacbbeeb3ad449ec06c1e524c1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 16 Sep 2025 13:33:49 +0000 Subject: [PATCH 35/65] Define the SubmitCommandFuncType template type and rebase --- sycl/include/sycl/queue.hpp | 7 ------- sycl/source/detail/queue_impl.cpp | 5 +++-- sycl/source/detail/queue_impl.hpp | 4 +--- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a86e34c57ed5b..3af78be6c8a4e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -156,11 +156,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; - detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } - const detail::ABINeutralKernelNameStrT &KernelName() const { - return MKernelName; - } - std::shared_ptr &HostKernel() { return MHostKernel; } const std::shared_ptr &HostKernel() const { return MHostKernel; @@ -177,7 +172,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { } private: - detail::ABINeutralKernelNameStrT MKernelName; std::shared_ptr MHostKernel; detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; }; @@ -3714,7 +3708,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { new detail::HostKernel( KernelFunc)); - KRInfo.KernelName() = detail::getKernelName(); KRInfo.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 290ef21c37185..74282145c1428 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -446,7 +446,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( // No special captures supported yet for the no-handler path assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); - SubmitCommandFuncType SubmitKernelFunc = + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { std::unique_ptr CommandGroup; std::vector Args; @@ -463,7 +463,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( nullptr, // MKernel nullptr, // MKernelBundle std::move(CGData), std::move(Args), - toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative @@ -484,6 +484,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( return submit_direct(CallerNeedsEvent, SubmitKernelFunc); } +template detail::EventImplPtr queue_impl::submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 106bbceee18de..f2dd6212478e5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -936,9 +936,7 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - using SubmitCommandFuncType = - std::function; - + template detail::EventImplPtr submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc); From 3375e77e93c19752ceb7f982d712168ce5193b34 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 17 Sep 2025 10:55:30 +0000 Subject: [PATCH 36/65] Use the KernelData structure in the no-handler path --- sycl/source/detail/queue_impl.cpp | 51 ++++++++++--------- sycl/source/detail/queue_impl.hpp | 17 ++++++- .../FreeFunctionEventsHelpers.hpp | 12 +++++ 3 files changed, 55 insertions(+), 25 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 74282145c1428..313a47cb56a46 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -443,42 +443,45 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - // No special captures supported yet for the no-handler path - assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); + KernelData KData; + + KData.setDeviceKernelInfoPtr(KRInfo.DeviceKernelInfoPtr()); + KData.setKernelFunc(KRInfo.GetKernelFuncPtr()); + KData.setNDRDesc(NDRDesc); + + return submit_kernel_direct_impl(KData, KRInfo.HostKernel(), CallerNeedsEvent, + CodeLoc, IsTopCodeLoc); +} + +detail::EventImplPtr queue_impl::submit_kernel_direct_impl( + KernelData &KData, std::shared_ptr HostKernel, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { std::unique_ptr CommandGroup; - std::vector Args; std::vector> StreamStorage; std::vector> AuxiliaryResources; - Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), - KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, - KRInfo.DeviceKernelInfoPtr()->NumParams); + KData.extractArgsAndReqsFromLambda(); CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), KRInfo.HostKernel(), - nullptr, // MKernel - nullptr, // MKernelBundle - std::move(CGData), std::move(Args), - *KRInfo.DeviceKernelInfoPtr(), - std::move(StreamStorage), std::move(AuxiliaryResources), - detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // MKernelIsCooperative - false, // MKernelUsesClusterLaunch - 0, // MKernelWorkGroupMemorySize + KData.getNDRDesc(), HostKernel, + nullptr, // Kernel + nullptr, // KernelBundle + std::move(CGData), std::move(KData).getArgs(), + *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // KernelIsCooperative + false, // KernelUsesClusterLaunch + 0, // KernelWorkGroupMemorySize CodeLoc)); CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; - // TODO DiscardEvent should include a check for requirements list - // once accessors are implemented - bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); - - EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), *this, !DiscardEvent); - return EventImpl; + return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true); }; return submit_direct(CallerNeedsEvent, SubmitKernelFunc); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index f2dd6212478e5..546b8f5101092 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -922,7 +922,6 @@ class queue_impl : public std::enable_shared_from_this { /// Performs kernel submission to the queue. /// /// \param NDRDesc is an NDRange descriptor - /// \param SubmitInfo is additional optional information for the submission. /// \param KRInfo is a descriptor of the kernel /// \param CallerNeedsEvent is a boolean indicating whether the event is /// required by the user after the call. @@ -936,6 +935,22 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); + /// Performs kernel submission to the queue. + /// + /// \param KData contains aggregated data related to the kernel + /// \param HostKernel stores the kernel lambda instance + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. + /// \param CodeLoc is the code location of the submit call + /// \param IsTopCodeLoc Used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command group or nullptr. + detail::EventImplPtr submit_kernel_direct_impl( + KernelData &KData, std::shared_ptr HostKernel, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + template detail::EventImplPtr submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc); diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp index c45d72ea4c343..6272b3d7d7a8a 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp @@ -26,40 +26,52 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) { static thread_local size_t counter_urEnqueueKernelLaunch = 0; inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { ++counter_urEnqueueKernelLaunch; +// The no-handler scheduler submission includes a fix for the event return, +// which is not yet applied to the handler-based path +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemcpy = 0; inline ur_result_t redefined_urUSMEnqueueMemcpy(void *pParams) { ++counter_urUSMEnqueueMemcpy; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueFill = 0; inline ur_result_t redefined_urUSMEnqueueFill(void *pParams) { ++counter_urUSMEnqueueFill; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueuePrefetch = 0; inline ur_result_t redefined_urUSMEnqueuePrefetch(void *pParams) { ++counter_urUSMEnqueuePrefetch; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemAdvise = 0; inline ur_result_t redefined_urUSMEnqueueMemAdvise(void *pParams) { ++counter_urUSMEnqueueMemAdvise; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } From 72dc199ed2bc67e0107517f7c3faaa2a23a11191 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 17 Sep 2025 12:28:21 +0000 Subject: [PATCH 37/65] Rename KernelRuntimeInfo to KernelDataDesc --- sycl/include/sycl/queue.hpp | 48 +++++++++++++++---------------- sycl/source/detail/queue_impl.cpp | 8 +++--- sycl/source/detail/queue_impl.hpp | 28 +++++++++--------- sycl/source/queue.cpp | 24 ++++++++-------- 4 files changed, 54 insertions(+), 54 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 3af78be6c8a4e..7bc23565ddd44 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -144,17 +144,17 @@ using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, // extracted from the compile time kernel structures. -class __SYCL_EXPORT KernelRuntimeInfo { +class __SYCL_EXPORT KernelDataDesc { public: - KernelRuntimeInfo() {} + KernelDataDesc() {} - KernelRuntimeInfo(const KernelRuntimeInfo &rhs) = delete; + KernelDataDesc(const KernelDataDesc &rhs) = delete; - KernelRuntimeInfo(KernelRuntimeInfo &&rhs) = delete; + KernelDataDesc(KernelDataDesc &&rhs) = delete; - KernelRuntimeInfo &operator=(const KernelRuntimeInfo &rhs) = delete; + KernelDataDesc &operator=(const KernelDataDesc &rhs) = delete; - KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; + KernelDataDesc &operator=(KernelDataDesc &&rhs) = delete; std::shared_ptr &HostKernel() { return MHostKernel; } const std::shared_ptr &HostKernel() const { @@ -3693,8 +3693,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - void ProcessKernelRuntimeInfo(const KernelType &KernelFunc, - detail::v1::KernelRuntimeInfo &KRInfo) const { + void ProcessKernelDataDesc(const KernelType &KernelFunc, + detail::v1::KernelDataDesc &KDDesc) const { using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< @@ -3704,11 +3704,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename TransformUserItemType::type>, void>; - KRInfo.HostKernel().reset( + KDDesc.HostKernel().reset( new detail::HostKernel( KernelFunc)); - KRInfo.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); + KDDesc.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES @@ -3783,27 +3783,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. @@ -3845,18 +3845,18 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::code_location::current()) const { (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::KernelRuntimeInfo KRInfo{}; + detail::v1::KernelDataDesc KDDesc{}; using NameT = typename detail::get_kernel_name_t::name; - ProcessKernelRuntimeInfo(KernelFunc, - KRInfo); + ProcessKernelDataDesc(KernelFunc, + KDDesc); detail::KernelWrapper::wrap(KernelFunc); - return submit_kernel_direct_with_event_impl(Range, KRInfo, + return submit_kernel_direct_with_event_impl(Range, KDDesc, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } @@ -3869,18 +3869,18 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::code_location::current()) const { (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::KernelRuntimeInfo KRInfo{}; + detail::v1::KernelDataDesc KDDesc{}; using NameT = typename detail::get_kernel_name_t::name; - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelDataDesc(KernelFunc, KDDesc); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - submit_kernel_direct_without_event_impl(Range, KRInfo, + submit_kernel_direct_without_event_impl(Range, KDDesc, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 313a47cb56a46..6b0f98fd91b4e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -439,17 +439,17 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + const NDRDescT &NDRDesc, const v1::KernelDataDesc &KDDesc, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; - KData.setDeviceKernelInfoPtr(KRInfo.DeviceKernelInfoPtr()); - KData.setKernelFunc(KRInfo.GetKernelFuncPtr()); + KData.setDeviceKernelInfoPtr(KDDesc.DeviceKernelInfoPtr()); + KData.setKernelFunc(KDDesc.GetKernelFuncPtr()); KData.setNDRDesc(NDRDesc); - return submit_kernel_direct_impl(KData, KRInfo.HostKernel(), CallerNeedsEvent, + return submit_kernel_direct_impl(KData, KDDesc.HostKernel(), CallerNeedsEvent, CodeLoc, IsTopCodeLoc); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 546b8f5101092..dc5ea2e72f35e 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -360,47 +360,47 @@ class queue_impl : public std::enable_shared_from_this { } event submit_kernel_direct_with_event( - nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); + NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event submit_kernel_direct_with_event( - nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); + NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event submit_kernel_direct_with_event( - nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); + NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } void submit_kernel_direct_without_event( - nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, IsTopCodeLoc); } void submit_kernel_direct_without_event( - nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, IsTopCodeLoc); } void submit_kernel_direct_without_event( - nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, IsTopCodeLoc); } @@ -922,7 +922,7 @@ class queue_impl : public std::enable_shared_from_this { /// Performs kernel submission to the queue. /// /// \param NDRDesc is an NDRange descriptor - /// \param KRInfo is a descriptor of the kernel + /// \param KDDesc is a descriptor of the kernel /// \param CallerNeedsEvent is a boolean indicating whether the event is /// required by the user after the call. /// \param CodeLoc is the code location of the submit call @@ -931,7 +931,7 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_kernel_direct_impl( - const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + const NDRDescT &NDRDesc, const v1::KernelDataDesc &KDDesc, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 14febaa52db95..cb7da34228098 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -320,44 +320,44 @@ event queue::submit_with_event_impl( } event queue::submit_kernel_direct_with_event_impl( - nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, IsTopCodeLoc); } From 177277b8f4c71868954e7cc77785d755d97a0730 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 17 Sep 2025 17:47:06 +0000 Subject: [PATCH 38/65] Minor changes in the template variables --- sycl/include/sycl/queue.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7bc23565ddd44..532fdfb85338c 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2701,10 +2701,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_kernel_direct_with_event( - ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, - KernelFunc); + return submit_kernel_direct_with_event(Properties, nd_range<1>{1, 1}, + KernelFunc); #else return submit( @@ -3263,8 +3263,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_kernel_direct_with_event>( + return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3835,9 +3834,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template + template > event submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, From eb9a5d62e36b8324de3d1e7208704c56c01c5bda Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 17 Sep 2025 17:52:59 +0000 Subject: [PATCH 39/65] Remove unused type --- sycl/include/sycl/queue.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 532fdfb85338c..1d9b490e8f9c5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -140,8 +140,6 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); - // This class is intended to store the kernel runtime information, // extracted from the compile time kernel structures. class __SYCL_EXPORT KernelDataDesc { From 1f8ea92d809152a19d3492ecbd4c2cbac11a1a17 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 22 Sep 2025 10:33:14 +0000 Subject: [PATCH 40/65] Remove KernelDataDesc and pass the arguments directly, limit the scope to parallel_for with nd_range kernel submissions. --- .../oneapi/experimental/enqueue_functions.hpp | 1 + sycl/include/sycl/queue.hpp | 137 +++++++----------- sycl/source/detail/queue_impl.cpp | 17 ++- sycl/source/detail/queue_impl.hpp | 57 +++++--- sycl/source/queue.cpp | 42 +++--- 5 files changed, 118 insertions(+), 136 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 3c0b4606154d7..d79f194a2049e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 1d9b490e8f9c5..fb88f26389831 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -140,40 +140,6 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -// This class is intended to store the kernel runtime information, -// extracted from the compile time kernel structures. -class __SYCL_EXPORT KernelDataDesc { -public: - KernelDataDesc() {} - - KernelDataDesc(const KernelDataDesc &rhs) = delete; - - KernelDataDesc(KernelDataDesc &&rhs) = delete; - - KernelDataDesc &operator=(const KernelDataDesc &rhs) = delete; - - KernelDataDesc &operator=(KernelDataDesc &&rhs) = delete; - - std::shared_ptr &HostKernel() { return MHostKernel; } - const std::shared_ptr &HostKernel() const { - return MHostKernel; - } - - char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } - char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } - - detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { - return MDeviceKernelInfoPtr; - } - detail::DeviceKernelInfo *DeviceKernelInfoPtr() const { - return MDeviceKernelInfoPtr; - } - -private: - std::shared_ptr MHostKernel; - detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; -}; - } // namespace v1 } // namespace detail @@ -2697,21 +2663,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_kernel_direct_with_event(Properties, nd_range<1>{1, 1}, - KernelFunc); - -#else return submit( [&](handler &CGH) { CGH.template single_task( Properties, KernelFunc); }, TlsCodeLocCapture.query()); -#endif } /// single_task version with a kernel represented as a lambda. @@ -3688,26 +3645,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; - template - void ProcessKernelDataDesc(const KernelType &KernelFunc, - detail::v1::KernelDataDesc &KDDesc) const { - - using LambdaArgType = sycl::detail::lambda_arg_type>; - using TransformedArgType = std::conditional_t< - WrapAsVal == detail::WrapAs::parallel_for, - std::conditional_t< - std::is_integral::value && Dims == 1, item, - typename TransformUserItemType::type>, - void>; - - KDDesc.HostKernel().reset( - new detail::HostKernel( - KernelFunc)); - - KDDesc.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); - } - #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. /// Not using `type_erased_cgfo_ty` on purpose. @@ -3780,27 +3717,33 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. @@ -3833,9 +3776,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } template > + typename KernelType, int Dims> event submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3843,20 +3784,31 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::code_location::current()) const { (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::KernelDataDesc KDDesc{}; + std::shared_ptr HostKernel; + detail::DeviceKernelInfo *DeviceKernelInfoPtr = nullptr; using NameT = typename detail::get_kernel_name_t::name; + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; - ProcessKernelDataDesc(KernelFunc, - KDDesc); + HostKernel.reset( + new detail::HostKernel( + KernelFunc)); - detail::KernelWrapper::wrap(KernelFunc); + DeviceKernelInfoPtr = &detail::getDeviceKernelInfo(); + + detail::KernelWrapper::wrap(KernelFunc); - return submit_kernel_direct_with_event_impl(Range, KDDesc, - TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + return submit_kernel_direct_with_event_impl( + Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } template { detail::code_location::current()) const { (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::KernelDataDesc KDDesc{}; + std::shared_ptr HostKernel; + detail::DeviceKernelInfo *DeviceKernelInfoPtr = nullptr; using NameT = typename detail::get_kernel_name_t::name; + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + + HostKernel.reset( + new detail::HostKernel( + KernelFunc)); - ProcessKernelDataDesc(KernelFunc, KDDesc); + DeviceKernelInfoPtr = &detail::getDeviceKernelInfo(); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + TransformedArgType, PropertiesT>::wrap(KernelFunc); - submit_kernel_direct_without_event_impl(Range, KDDesc, - TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + submit_kernel_direct_without_event_impl( + Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } /// Submits a command group function object to the queue, in order to be diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6b0f98fd91b4e..22710565c90fa 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -439,22 +439,23 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, const v1::KernelDataDesc &KDDesc, - bool CallerNeedsEvent, const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + const NDRDescT &NDRDesc, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; - KData.setDeviceKernelInfoPtr(KDDesc.DeviceKernelInfoPtr()); - KData.setKernelFunc(KDDesc.GetKernelFuncPtr()); + KData.setDeviceKernelInfoPtr(DeviceKernelInfo); + KData.setKernelFunc(HostKernel->getPtr()); KData.setNDRDesc(NDRDesc); - return submit_kernel_direct_impl(KData, KDDesc.HostKernel(), CallerNeedsEvent, - CodeLoc, IsTopCodeLoc); + return submit_kernel_direct_impl(KData, HostKernel, CallerNeedsEvent, CodeLoc, + IsTopCodeLoc); } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - KernelData &KData, std::shared_ptr HostKernel, + KernelData &KData, std::shared_ptr &HostKernel, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index dc5ea2e72f35e..6a605f7d6a4b4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -360,48 +360,57 @@ class queue_impl : public std::enable_shared_from_this { } event submit_kernel_direct_with_event( - nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = + submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, + true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event submit_kernel_direct_with_event( - nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = + submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, + true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event submit_kernel_direct_with_event( - nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = + submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, + true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } void submit_kernel_direct_without_event( - nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, - IsTopCodeLoc); + submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, + false, CodeLoc, IsTopCodeLoc); } void submit_kernel_direct_without_event( - nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, - IsTopCodeLoc); + submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, + false, CodeLoc, IsTopCodeLoc); } void submit_kernel_direct_without_event( - nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, - IsTopCodeLoc); + submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, + false, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -922,7 +931,8 @@ class queue_impl : public std::enable_shared_from_this { /// Performs kernel submission to the queue. /// /// \param NDRDesc is an NDRange descriptor - /// \param KDDesc is a descriptor of the kernel + /// \param HostKernel is a wrapper for a kernel lambda + /// \param DeviceKernelInfo is a structure aggregating kernel related data /// \param CallerNeedsEvent is a boolean indicating whether the event is /// required by the user after the call. /// \param CodeLoc is the code location of the submit call @@ -931,9 +941,10 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_kernel_direct_impl( - const NDRDescT &NDRDesc, const v1::KernelDataDesc &KDDesc, - bool CallerNeedsEvent, const detail::code_location &CodeLoc, - bool IsTopCodeLoc); + const NDRDescT &NDRDesc, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); /// Performs kernel submission to the queue. /// @@ -947,7 +958,7 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_kernel_direct_impl( - KernelData &KData, std::shared_ptr HostKernel, + KernelData &KData, std::shared_ptr &HostKernel, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index cb7da34228098..b2b239d853b9c 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -320,45 +320,51 @@ event queue::submit_with_event_impl( } event queue::submit_kernel_direct_with_event_impl( - nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, - IsTopCodeLoc); + return impl->submit_kernel_direct_with_event( + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, - IsTopCodeLoc); + return impl->submit_kernel_direct_with_event( + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, - IsTopCodeLoc); + return impl->submit_kernel_direct_with_event( + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, - IsTopCodeLoc); + impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, + CodeLoc, IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, - IsTopCodeLoc); + impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, + CodeLoc, IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, - IsTopCodeLoc); + impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, + CodeLoc, IsTopCodeLoc); } void queue::submit_without_event_impl( From 0e48eb666734e90cc5abeb6bd81a1dc409c520e3 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 22 Sep 2025 10:57:10 +0000 Subject: [PATCH 41/65] Code cleanup --- .../oneapi/experimental/enqueue_functions.hpp | 1 - sycl/include/sycl/queue.hpp | 7 ------- sycl/source/detail/queue_impl.cpp | 21 ------------------- sycl/source/detail/queue_impl.hpp | 6 +----- .../FreeFunctionEventsHelpers.hpp | 5 +++-- 5 files changed, 4 insertions(+), 36 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index d79f194a2049e..3c0b4606154d7 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index fb88f26389831..293267a398f5d 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3638,13 +3638,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } - template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; - }; - #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. /// Not using `type_erased_cgfo_ty` on purpose. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 22710565c90fa..d4524950acff3 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -417,27 +417,6 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -std::vector queue_impl::extractArgsAndReqsFromLambda( - char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), - size_t NumKernelParams) { - - size_t IndexShift = 0; - std::vector Args; - - Args.reserve(NumKernelParams); - - for (size_t I = 0; I < NumKernelParams; ++I) { - detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); - void *Ptr = LambdaPtr + ParamDesc.offset; - const detail::kernel_param_kind_t &Kind = ParamDesc.kind; - const int &Size = ParamDesc.info; - - Args.emplace_back(Kind, Ptr, Size, I + IndexShift); - } - - return Args; -} - detail::EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, std::shared_ptr &HostKernel, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 6a605f7d6a4b4..22df681d42299 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -924,14 +924,10 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); - std::vector extractArgsAndReqsFromLambda( - char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), - size_t NumKernelParams); - /// Performs kernel submission to the queue. /// /// \param NDRDesc is an NDRange descriptor - /// \param HostKernel is a wrapper for a kernel lambda + /// \param HostKernel stores the kernel lambda instance /// \param DeviceKernelInfo is a structure aggregating kernel related data /// \param CallerNeedsEvent is a boolean indicating whether the event is /// required by the user after the call. diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp index 6272b3d7d7a8a..e0a5c9be50c15 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp @@ -26,8 +26,9 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) { static thread_local size_t counter_urEnqueueKernelLaunch = 0; inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { ++counter_urEnqueueKernelLaunch; -// The no-handler scheduler submission includes a fix for the event return, -// which is not yet applied to the handler-based path +// TODO The no-handler scheduler submission includes a fix for the event return, +// where the event is returned by the scheduler on every submission. This fix +// is not yet applied to the handler-based path. #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); From 0d47ac78c61540f787db7d76510d21621c83c521 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 22 Sep 2025 11:06:35 +0000 Subject: [PATCH 42/65] Update Linux symbols --- sycl/test/abi/sycl_symbols_linux.dump | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 98515b118ef88..b7258df93f75d 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3573,6 +3573,7 @@ _ZN4sycl3_V17handler18RangeRoundingTraceEv _ZN4sycl3_V17handler18ext_oneapi_barrierERKSt6vectorINS0_5eventESaIS3_EE _ZN4sycl3_V17handler18extractArgsAndReqsEv _ZN4sycl3_V17handler19getMaxWorkGroups_v2Ev +_ZN4sycl3_V17handler19setDeviceKernelInfoEONS0_6kernelE _ZN4sycl3_V17handler19supportsUSMMemcpy2DEv _ZN4sycl3_V17handler19supportsUSMMemset2DEv _ZN4sycl3_V17handler20DisableRangeRoundingEv @@ -3595,7 +3596,6 @@ _ZN4sycl3_V17handler21setKernelWorkGroupMemEm _ZN4sycl3_V17handler21setUserFacingNodeTypeENS0_3ext6oneapi12experimental9node_typeE _ZN4sycl3_V17handler22ext_oneapi_fill2d_implEPvmPKvmmm _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm -_ZN4sycl3_V17handler19setDeviceKernelInfoEONS0_6kernelE _ZN4sycl3_V17handler22setDeviceKernelInfoPtrEPNS0_6detail16DeviceKernelInfoE _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE @@ -3755,12 +3755,12 @@ _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyER _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi1EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi2EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi3EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi1EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi2EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi3EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv From ab6812ae02e5fd85e75e6238644022ebf649a86c Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 22 Sep 2025 11:19:34 +0000 Subject: [PATCH 43/65] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index bce53d9f078b7..8dd9a053596fb 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -298,7 +298,6 @@ ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_K_NAEBVproperty_list@23@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_N_K4AEBVproperty_list@23@@Z ??0HostProfilingInfo@detail@_V1@sycl@@QEAA@XZ -??0KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??0LocalAccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VLocalAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -509,7 +508,6 @@ ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@AEBUcode_location@123@@Z ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@XZ ??1AccessorBaseHost@detail@_V1@sycl@@QEAA@XZ -??1KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??1LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1SYCLCategory@detail@_V1@sycl@@UEAA@XZ ??1SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ @@ -713,8 +711,6 @@ ??_Fqueue@_V1@sycl@@QEAAXXZ ?AccessTargetMask@handler@_V1@sycl@@0HB ?Clear@exception_list@_V1@sycl@@AEAAXXZ -?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAPEAVDeviceKernelInfo@345@XZ -?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEAVDeviceKernelInfo@345@XZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ ?EventMode@SubmissionInfo@detail@_V1@sycl@@QEAAAEAW4event_mode_enum@experimental@oneapi@ext@34@XZ @@ -723,14 +719,8 @@ ?EventMode@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBW4event_mode_enum@experimental@oneapi@ext@45@XZ ?GDBMethodsAnchor@SampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ -?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAPEADXZ -?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEADXZ ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z -?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ -?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ -?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAVstring@345@XZ -?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBVstring@345@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@v1@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@345@XZ @@ -4429,8 +4419,8 @@ ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$01@23@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z -?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z +?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z ?setKernelNameBasedCachePtr@handler@_V1@sycl@@AEAAXPEAUKernelNameBasedCacheT@detail@23@@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z @@ -4477,12 +4467,6 @@ ?storeRawArg@handler@_V1@sycl@@AEAAPEAXAEBVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?storeRawArg@handler@_V1@sycl@@AEAAPEAXPEBX_K@Z ?stringifyErrorCode@detail@_V1@sycl@@YAPEBDH@Z -?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z -?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z -?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z -?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z -?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z -?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@@Z @@ -4491,6 +4475,12 @@ ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z +?submit_kernel_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$00@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@923@_N@Z +?submit_kernel_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$01@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@923@_N@Z +?submit_kernel_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$02@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@923@_N@Z +?submit_kernel_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$00@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@823@_N@Z +?submit_kernel_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$01@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@823@_N@Z +?submit_kernel_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$02@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@823@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@623@AEBUcode_location@623@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@v1@623@AEBUcode_location@623@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBVSubmissionInfo@detail@23@AEBUcode_location@823@_N@Z From a35286dedb5e673653e3673f2bc9c3b3a7ebcfc5 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 22 Sep 2025 14:45:50 +0000 Subject: [PATCH 44/65] Address review comments --- .../oneapi/experimental/enqueue_functions.hpp | 26 +++-------------- sycl/include/sycl/queue.hpp | 28 +++++++------------ sycl/test/abi/sycl_symbols_linux.dump | 2 +- sycl/test/abi/sycl_symbols_windows.dump | 2 +- 4 files changed, 16 insertions(+), 42 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 3c0b4606154d7..d59efed04dd92 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -174,18 +174,6 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } -template -event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail:: - submit_kernel_direct_with_event_impl(Q, Props, Range, KernelFunc, - CodeLoc); -} - template void single_task(handler &CGH, const KernelType &KernelObj) { CGH.single_task(KernelObj); @@ -306,16 +294,13 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, if constexpr (sizeof...(ReductionsT) == 0) { submit(std::move(Q), empty_properties_t{}, Range, KernelObj); } else { +#endif submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); }); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } -#else - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Range, KernelObj, - std::forward(Reductions)...); - }); #endif } @@ -345,16 +330,13 @@ void nd_launch(queue Q, launch_config, Properties> Config, submit(std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), KernelObj); } else { +#endif submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, std::forward(Reductions)...); }); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } -#else - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); #endif } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 293267a398f5d..4a5b7e748d3b9 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3221,18 +3221,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { +#endif return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } -#else - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); #endif } @@ -3777,8 +3773,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::code_location::current()) const { (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - std::shared_ptr HostKernel; - detail::DeviceKernelInfo *DeviceKernelInfoPtr = nullptr; using NameT = typename detail::get_kernel_name_t::name; @@ -3790,11 +3784,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "must be either sycl::nd_item or be convertible from sycl::nd_item"); using TransformedArgType = sycl::nd_item; - HostKernel.reset( - new detail::HostKernel( - KernelFunc)); + std::shared_ptr HostKernel = std::make_shared< + detail::HostKernel>(KernelFunc); - DeviceKernelInfoPtr = &detail::getDeviceKernelInfo(); + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); detail::KernelWrapper::wrap(KernelFunc); @@ -3812,8 +3806,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::code_location::current()) const { (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - std::shared_ptr HostKernel; - detail::DeviceKernelInfo *DeviceKernelInfoPtr = nullptr; using NameT = typename detail::get_kernel_name_t::name; @@ -3825,11 +3817,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "must be either sycl::nd_item or be convertible from sycl::nd_item"); using TransformedArgType = sycl::nd_item; - HostKernel.reset( - new detail::HostKernel( - KernelFunc)); + std::shared_ptr HostKernel = std::make_shared< + detail::HostKernel>(KernelFunc); - DeviceKernelInfoPtr = &detail::getDeviceKernelInfo(); + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); detail::KernelWrapper::wrap(KernelFunc); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index b7258df93f75d..f88d58e1fee03 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3573,7 +3573,6 @@ _ZN4sycl3_V17handler18RangeRoundingTraceEv _ZN4sycl3_V17handler18ext_oneapi_barrierERKSt6vectorINS0_5eventESaIS3_EE _ZN4sycl3_V17handler18extractArgsAndReqsEv _ZN4sycl3_V17handler19getMaxWorkGroups_v2Ev -_ZN4sycl3_V17handler19setDeviceKernelInfoEONS0_6kernelE _ZN4sycl3_V17handler19supportsUSMMemcpy2DEv _ZN4sycl3_V17handler19supportsUSMMemset2DEv _ZN4sycl3_V17handler20DisableRangeRoundingEv @@ -3596,6 +3595,7 @@ _ZN4sycl3_V17handler21setKernelWorkGroupMemEm _ZN4sycl3_V17handler21setUserFacingNodeTypeENS0_3ext6oneapi12experimental9node_typeE _ZN4sycl3_V17handler22ext_oneapi_fill2d_implEPvmPKvmmm _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm +_ZN4sycl3_V17handler19setDeviceKernelInfoEONS0_6kernelE _ZN4sycl3_V17handler22setDeviceKernelInfoPtrEPNS0_6detail16DeviceKernelInfoE _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 8dd9a053596fb..2922e3baf9385 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4419,8 +4419,8 @@ ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$01@23@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z -?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z ?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z +?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z ?setKernelNameBasedCachePtr@handler@_V1@sycl@@AEAAXPEAUKernelNameBasedCacheT@detail@23@@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z From 27a5cf5fd5fe69d468083613af83c596105d09bf Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 22 Sep 2025 14:54:06 +0000 Subject: [PATCH 45/65] Rvalue reference for KernelData --- sycl/source/detail/queue_impl.cpp | 6 +++--- sycl/source/detail/queue_impl.hpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index d4524950acff3..bf3860173b319 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -429,12 +429,12 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setKernelFunc(HostKernel->getPtr()); KData.setNDRDesc(NDRDesc); - return submit_kernel_direct_impl(KData, HostKernel, CallerNeedsEvent, CodeLoc, - IsTopCodeLoc); + return submit_kernel_direct_impl(std::move(KData), HostKernel, + CallerNeedsEvent, CodeLoc, IsTopCodeLoc); } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - KernelData &KData, std::shared_ptr &HostKernel, + KernelData &&KData, std::shared_ptr &HostKernel, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 22df681d42299..8a40024e3a704 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -954,7 +954,7 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_kernel_direct_impl( - KernelData &KData, std::shared_ptr &HostKernel, + KernelData &&KData, std::shared_ptr &HostKernel, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); From 9144f842eb439636587c7ac44b9e42ec13491349 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 23 Sep 2025 09:17:11 +0000 Subject: [PATCH 46/65] Add a static_assert to check if properties are empty. --- sycl/include/sycl/queue.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 4a5b7e748d3b9..97586deeb1a9d 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3771,7 +3771,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { - (void)Props; + // TODO Properties not supported yet + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using NameT = @@ -3804,7 +3808,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { - (void)Props; + // TODO Properties not supported yet + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using NameT = From ca0b632e430313e833ec566d72c3839ce3a492b1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 23 Sep 2025 09:22:30 +0000 Subject: [PATCH 47/65] Remove unused function --- .../ext/oneapi/experimental/enqueue_functions.hpp | 9 --------- sycl/include/sycl/queue.hpp | 12 ------------ 2 files changed, 21 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index d59efed04dd92..8edd085ec487d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -120,15 +120,6 @@ void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, Dims>(Props, Range, KernelFunc, CodeLoc); } -template -event submit_kernel_direct_with_event_impl( - const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_kernel_direct_with_event(Props, Range, KernelFunc, - CodeLoc); -} } // namespace detail template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 97586deeb1a9d..5b2111c4e9cac 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -166,11 +166,6 @@ void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -template -event submit_kernel_direct_with_event_impl( - const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); } // namespace detail } // namespace ext::oneapi::experimental @@ -3614,13 +3609,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); - template - friend event - ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( - const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); - template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { From 42e2b3024c8fa0b76cf589f2cab7400b3bc87b6b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 23 Sep 2025 09:33:37 +0000 Subject: [PATCH 48/65] Fix unused parameter --- sycl/include/sycl/queue.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 5b2111c4e9cac..56b54a47dae05 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3760,6 +3760,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc = detail::code_location::current()) const { // TODO Properties not supported yet + (void)Props; static_assert( std::is_same_v, @@ -3797,6 +3798,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc = detail::code_location::current()) const { // TODO Properties not supported yet + (void)Props; static_assert( std::is_same_v, From 943f1f7387979fa6aa041f980567acb0a79201e1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 23 Sep 2025 14:17:33 +0000 Subject: [PATCH 49/65] Merge two overloads of submit_kernel_direct_impl --- sycl/source/detail/queue_impl.cpp | 9 --------- sycl/source/detail/queue_impl.hpp | 16 ---------------- 2 files changed, 25 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bf3860173b319..4965ab6e8d729 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -429,15 +429,6 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setKernelFunc(HostKernel->getPtr()); KData.setNDRDesc(NDRDesc); - return submit_kernel_direct_impl(std::move(KData), HostKernel, - CallerNeedsEvent, CodeLoc, IsTopCodeLoc); -} - -detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - KernelData &&KData, std::shared_ptr &HostKernel, - bool CallerNeedsEvent, const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { - auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { std::unique_ptr CommandGroup; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 8a40024e3a704..3f090c4c32e36 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -942,22 +942,6 @@ class queue_impl : public std::enable_shared_from_this { detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - /// Performs kernel submission to the queue. - /// - /// \param KData contains aggregated data related to the kernel - /// \param HostKernel stores the kernel lambda instance - /// \param CallerNeedsEvent is a boolean indicating whether the event is - /// required by the user after the call. - /// \param CodeLoc is the code location of the submit call - /// \param IsTopCodeLoc Used to determine if the object is in a local - /// scope or in the top level scope. - /// - /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_kernel_direct_impl( - KernelData &&KData, std::shared_ptr &HostKernel, - bool CallerNeedsEvent, const detail::code_location &CodeLoc, - bool IsTopCodeLoc); - template detail::EventImplPtr submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc); From 76bcaf269c06cb6596a5ed01acf27e3d15eda90f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 24 Sep 2025 10:26:26 +0000 Subject: [PATCH 50/65] Template instantiations for submit_kernel_direct_with_event_impl --- sycl/include/sycl/queue.hpp | 56 ++++++++++++++++++++++--------------- sycl/source/queue.cpp | 44 ++++++++++++++--------------- 2 files changed, 56 insertions(+), 44 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 56b54a47dae05..cb3fb47cddb4c 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3693,33 +3693,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + template event submit_kernel_direct_with_event_impl( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - - event submit_kernel_direct_with_event_impl( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - - event submit_kernel_direct_with_event_impl( - nd_range<3> Range, std::shared_ptr &HostKernel, + nd_range Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + template void submit_kernel_direct_without_event_impl( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - - void submit_kernel_direct_without_event_impl( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - - void submit_kernel_direct_without_event_impl( - nd_range<3> Range, std::shared_ptr &HostKernel, + nd_range Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; @@ -3984,6 +3966,36 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; +extern template event queue::submit_kernel_direct_with_event_impl<1>( + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +extern template event queue::submit_kernel_direct_with_event_impl<2>( + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +extern template event queue::submit_kernel_direct_with_event_impl<3>( + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +extern template void queue::submit_kernel_direct_without_event_impl<1>( + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +extern template void queue::submit_kernel_direct_without_event_impl<2>( + nd_range<2> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +extern template void queue::submit_kernel_direct_without_event_impl<3>( + nd_range<3> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + } // namespace _V1 } // namespace sycl diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index b2b239d853b9c..a3334b1e92ba4 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -319,53 +319,53 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +template event queue::submit_kernel_direct_with_event_impl( - nd_range<1> Range, std::shared_ptr &HostKernel, + nd_range Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event( Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_kernel_direct_with_event_impl( +template event queue::submit_kernel_direct_with_event_impl<1>( + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +template event queue::submit_kernel_direct_with_event_impl<2>( nd_range<2> Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); -} + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -event queue::submit_kernel_direct_with_event_impl( +template event queue::submit_kernel_direct_with_event_impl<3>( nd_range<3> Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); -} + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +template void queue::submit_kernel_direct_without_event_impl( - nd_range<1> Range, std::shared_ptr &HostKernel, + nd_range Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_kernel_direct_without_event_impl( +template void queue::submit_kernel_direct_without_event_impl<1>( + nd_range<1> Range, std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + +template void queue::submit_kernel_direct_without_event_impl<2>( nd_range<2> Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, - CodeLoc, IsTopCodeLoc); -} + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -void queue::submit_kernel_direct_without_event_impl( +template void queue::submit_kernel_direct_without_event_impl<3>( nd_range<3> Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, - CodeLoc, IsTopCodeLoc); -} + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, From 6588fe806f6318fcbf647db674a543ed191e26b2 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 24 Sep 2025 10:36:51 +0000 Subject: [PATCH 51/65] Change kernel direct submit functions in queue_impl to templates --- sycl/source/detail/queue_impl.hpp | 42 +++---------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 3f090c4c32e36..b637dcccc2cc2 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -359,8 +359,9 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } + template event submit_kernel_direct_with_event( - nd_range<1> Range, std::shared_ptr &HostKernel, + nd_range Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = @@ -369,44 +370,9 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(EventImpl); } - event submit_kernel_direct_with_event( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, CodeLoc, IsTopCodeLoc); - return createSyclObjFromImpl(EventImpl); - } - - event submit_kernel_direct_with_event( - nd_range<3> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, CodeLoc, IsTopCodeLoc); - return createSyclObjFromImpl(EventImpl); - } - - void submit_kernel_direct_without_event( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, CodeLoc, IsTopCodeLoc); - } - - void submit_kernel_direct_without_event( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, CodeLoc, IsTopCodeLoc); - } - + template void submit_kernel_direct_without_event( - nd_range<3> Range, std::shared_ptr &HostKernel, + nd_range Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, From 3c0e33cafed68094045865daafff02cd0c1771ba Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 24 Sep 2025 12:11:29 +0000 Subject: [PATCH 52/65] Update Linux symbols --- sycl/test/abi/sycl_symbols_linux.dump | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index f88d58e1fee03..76f9a5e543136 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3755,12 +3755,12 @@ _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyER _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi1EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi2EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi3EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi1EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi2EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi3EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS5_16DeviceKernelInfoERKNS5_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implILi1EEENS0_5eventENS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS7_16DeviceKernelInfoERKNS7_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implILi2EEENS0_5eventENS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS7_16DeviceKernelInfoERKNS7_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implILi3EEENS0_5eventENS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS7_16DeviceKernelInfoERKNS7_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implILi1EEEvNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS6_16DeviceKernelInfoERKNS6_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implILi2EEEvNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS6_16DeviceKernelInfoERKNS6_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implILi3EEEvNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS6_16DeviceKernelInfoERKNS6_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv From 8a20b8a301146786538376ab3cf3a19f27eef3f1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 24 Sep 2025 12:20:36 +0000 Subject: [PATCH 53/65] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2922e3baf9385..2c6be6c1684bb 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4475,12 +4475,6 @@ ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z -?submit_kernel_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$00@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@923@_N@Z -?submit_kernel_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$01@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@923@_N@Z -?submit_kernel_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$02@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@923@_N@Z -?submit_kernel_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$00@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@823@_N@Z -?submit_kernel_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$01@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@823@_N@Z -?submit_kernel_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$02@23@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@23@AEBUcode_location@823@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@623@AEBUcode_location@623@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@v1@623@AEBUcode_location@623@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBVSubmissionInfo@detail@23@AEBUcode_location@823@_N@Z From 2be3d3da839fbeba5921abc78caddef28ac0b431 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 25 Sep 2025 07:49:09 +0000 Subject: [PATCH 54/65] Convert the kernel direct submit functions to free functions --- .../oneapi/experimental/enqueue_functions.hpp | 29 +-- .../sycl/khr/free_function_commands.hpp | 5 +- sycl/include/sycl/queue.hpp | 183 ++++++++++-------- 3 files changed, 111 insertions(+), 106 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 8edd085ec487d..8f558432b81cc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -110,16 +110,6 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, return Q.submit_with_event(Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc); } -template -void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc) { - Q.submit_kernel_direct_without_event(Props, Range, KernelFunc, CodeLoc); -} - } // namespace detail template @@ -137,17 +127,6 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } -template -void submit(const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_kernel_direct_impl< - KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, - CodeLoc); -} - template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, @@ -283,7 +262,8 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(ReductionsT) == 0) { - submit(std::move(Q), empty_properties_t{}, Range, KernelObj); + detail::submit_kernel_direct_without_event( + std::move(Q), empty_properties_t{}, Range, KernelObj); } else { #endif submit(std::move(Q), [&](handler &CGH) { @@ -318,8 +298,9 @@ void nd_launch(queue Q, launch_config, Properties> Config, ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> ConfigAccess(Config); - submit(std::move(Q), ConfigAccess.getProperties(), - ConfigAccess.getRange(), KernelObj); + detail::submit_kernel_direct_without_event( + std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), + KernelObj); } else { #endif submit(std::move(Q), [&](handler &CGH) { diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4dd7c067126a1..000d6ce8738a4 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -154,8 +154,9 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), k); + detail::submit_kernel_direct_without_event( + q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), + k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index cb3fb47cddb4c..9cfb67d7cd623 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -141,6 +141,21 @@ class __SYCL_EXPORT SubmissionInfo { }; } // namespace v1 + +template +event submit_kernel_direct_with_event( + const queue &Queue, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = detail::code_location::current()); + +template +void submit_kernel_direct_without_event( + const queue &Queue, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = detail::code_location::current()); + } // namespace detail namespace ext ::oneapi ::experimental { @@ -3605,9 +3620,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - friend void ext::oneapi::experimental::detail::submit_kernel_direct_impl( - const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + friend void sycl::detail::submit_kernel_direct_without_event( + const queue &Queue, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const detail::code_location &CodeLoc); + + template + friend event sycl::detail::submit_kernel_direct_with_event( + const queue &Queue, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const detail::code_location &CodeLoc); template void ProcessSubmitProperties(PropertiesT Props, @@ -3734,83 +3755,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template - event - submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using NameT = - typename detail::get_kernel_name_t::name; - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - - std::shared_ptr HostKernel = std::make_shared< - detail::HostKernel>(KernelFunc); - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - - detail::KernelWrapper::wrap(KernelFunc); - - return submit_kernel_direct_with_event_impl( - Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } - - template - void submit_kernel_direct_without_event( - PropertiesT Props, nd_range Range, const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using NameT = - typename detail::get_kernel_name_t::name; - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - - std::shared_ptr HostKernel = std::make_shared< - detail::HostKernel>(KernelFunc); - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - - detail::KernelWrapper::wrap(KernelFunc); - - submit_kernel_direct_without_event_impl( - Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } - /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// @@ -3996,6 +3940,85 @@ extern template void queue::submit_kernel_direct_without_event_impl<3>( detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +namespace detail { + +template +event submit_kernel_direct_with_event(const queue &Queue, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc) { + // TODO Properties not supported yet + (void)Props; + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + + std::shared_ptr HostKernel = std::make_shared< + detail::HostKernel>(KernelFunc); + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + + detail::KernelWrapper::wrap(KernelFunc); + + return Queue.submit_kernel_direct_with_event_impl( + Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); +} + +template +void submit_kernel_direct_without_event(const queue &Queue, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc) { + // TODO Properties not supported yet + (void)Props; + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + + std::shared_ptr HostKernel = std::make_shared< + detail::HostKernel>(KernelFunc); + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + + detail::KernelWrapper::wrap(KernelFunc); + + Queue.submit_kernel_direct_without_event_impl( + Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); +} +} // namespace detail + } // namespace _V1 } // namespace sycl From f139c93bcd689c10cfe615dafd7e6a28a00ed610 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 25 Sep 2025 08:01:27 +0000 Subject: [PATCH 55/65] Minor fixes --- sycl/include/sycl/queue.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 9cfb67d7cd623..2f728582f5a40 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3228,8 +3228,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_kernel_direct_with_event( - ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); + return detail::submit_kernel_direct_with_event( + *this, ext::oneapi::experimental::empty_properties_t{}, Range, + Rest...); } else { #endif return submit( @@ -3942,8 +3943,8 @@ extern template void queue::submit_kernel_direct_without_event_impl<3>( namespace detail { -template +template event submit_kernel_direct_with_event(const queue &Queue, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3980,8 +3981,8 @@ event submit_kernel_direct_with_event(const queue &Queue, PropertiesT Props, TlsCodeLocCapture.isToplevel()); } -template +template void submit_kernel_direct_without_event(const queue &Queue, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, From 8023ec1ed08eb64ae7a980a871b63bc96f66e457 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 25 Sep 2025 14:33:08 +0000 Subject: [PATCH 56/65] Add missing calls. --- sycl/include/sycl/khr/free_function_commands.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 000d6ce8738a4..31ef30745b653 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -169,8 +169,9 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), k); + detail::submit_kernel_direct_without_event( + q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), + k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -183,8 +184,9 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), k); + detail::submit_kernel_direct_without_event( + q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), + k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, From de94db6b9b4263a63f85e102832da97b9f4705d5 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 09:40:49 +0000 Subject: [PATCH 57/65] Remove extern template definitions --- sycl/include/sycl/queue.hpp | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 2f728582f5a40..b68339d8dc879 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3911,36 +3911,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; -extern template event queue::submit_kernel_direct_with_event_impl<1>( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -extern template event queue::submit_kernel_direct_with_event_impl<2>( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -extern template event queue::submit_kernel_direct_with_event_impl<3>( - nd_range<3> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -extern template void queue::submit_kernel_direct_without_event_impl<1>( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -extern template void queue::submit_kernel_direct_without_event_impl<2>( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -extern template void queue::submit_kernel_direct_without_event_impl<3>( - nd_range<3> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - namespace detail { template Date: Fri, 26 Sep 2025 10:00:50 +0000 Subject: [PATCH 58/65] Consolidate the event-based and event-less functions --- .../oneapi/experimental/enqueue_functions.hpp | 6 +- .../sycl/khr/free_function_commands.hpp | 18 ++-- sycl/include/sycl/queue.hpp | 87 +++++-------------- 3 files changed, 33 insertions(+), 78 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 8f558432b81cc..53a2d707fbaf8 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -262,8 +262,8 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(ReductionsT) == 0) { - detail::submit_kernel_direct_without_event( - std::move(Q), empty_properties_t{}, Range, KernelObj); + detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, + Range, KernelObj); } else { #endif submit(std::move(Q), [&](handler &CGH) { @@ -298,7 +298,7 @@ void nd_launch(queue Q, launch_config, Properties> Config, ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> ConfigAccess(Config); - detail::submit_kernel_direct_without_event( + detail::submit_kernel_direct( std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), KernelObj); } else { diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 31ef30745b653..c0d2afc4115ad 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -154,9 +154,9 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - detail::submit_kernel_direct_without_event( - q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), - k); + detail::submit_kernel_direct(q, + ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -169,9 +169,9 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - detail::submit_kernel_direct_without_event( - q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), - k); + detail::submit_kernel_direct(q, + ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -184,9 +184,9 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - detail::submit_kernel_direct_without_event( - q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), - k); + detail::submit_kernel_direct(q, + ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b68339d8dc879..10bdc59d0a334 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -142,16 +142,9 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 -template -event submit_kernel_direct_with_event( - const queue &Queue, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = detail::code_location::current()); - -template -void submit_kernel_direct_without_event( +template +auto submit_kernel_direct( const queue &Queue, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()); @@ -3228,7 +3221,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return detail::submit_kernel_direct_with_event( + return detail::submit_kernel_direct( *this, ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { @@ -3619,15 +3612,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); - template - friend void sycl::detail::submit_kernel_direct_without_event( - const queue &Queue, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, const detail::code_location &CodeLoc); - - template - friend event sycl::detail::submit_kernel_direct_with_event( + template + friend auto sycl::detail::submit_kernel_direct( const queue &Queue, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc); @@ -3913,12 +3900,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { namespace detail { -template -event submit_kernel_direct_with_event(const queue &Queue, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc) { +template +auto submit_kernel_direct(const queue &Queue, PropertiesT Props, + nd_range Range, const KernelType &KernelFunc, + const detail::code_location &CodeLoc) { // TODO Properties not supported yet (void)Props; static_assert( @@ -3946,48 +3932,17 @@ event submit_kernel_direct_with_event(const queue &Queue, PropertiesT Props, detail::KernelWrapper::wrap(KernelFunc); - return Queue.submit_kernel_direct_with_event_impl( - Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + if constexpr (EventNeeded) { + return Queue.submit_kernel_direct_with_event_impl( + Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } else { + Queue.submit_kernel_direct_without_event_impl( + Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } } -template -void submit_kernel_direct_without_event(const queue &Queue, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc) { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using NameT = - typename detail::get_kernel_name_t::name; - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - - std::shared_ptr HostKernel = std::make_shared< - detail::HostKernel>(KernelFunc); - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - - detail::KernelWrapper::wrap(KernelFunc); - - Queue.submit_kernel_direct_without_event_impl( - Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); -} } // namespace detail } // namespace _V1 From eed05919947d1d03434fa99dbd5167e3d9225e6a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 11:03:06 +0000 Subject: [PATCH 59/65] Make free functions from the queue kernel direct submit methods --- sycl/include/sycl/queue.hpp | 118 ++++++++++++++++-------------------- sycl/source/queue.cpp | 104 ++++++++++++++++--------------- 2 files changed, 108 insertions(+), 114 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 10bdc59d0a334..2fd790928a32a 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -62,6 +62,20 @@ template auto get_native(const SyclObjectT &Obj) -> backend_return_t; +template +event __SYCL_EXPORT submit_kernel_direct_with_event_impl( + const queue &Queue, nd_range Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + +template +void __SYCL_EXPORT submit_kernel_direct_without_event_impl( + const queue &Queue, nd_range Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + namespace detail { class queue_impl; @@ -147,7 +161,44 @@ template Range, const KernelType &KernelFunc, - const detail::code_location &CodeLoc = detail::code_location::current()); + const detail::code_location &CodeLoc = detail::code_location::current()) { + // TODO Properties not supported yet + (void)Props; + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + + std::shared_ptr HostKernel = std::make_shared< + detail::HostKernel>(KernelFunc); + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + + detail::KernelWrapper::wrap(KernelFunc); + + if constexpr (EventNeeded) { + return submit_kernel_direct_with_event_impl( + Queue, Range, HostKernel, DeviceKernelInfoPtr, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } else { + submit_kernel_direct_without_event_impl( + Queue, Range, HostKernel, DeviceKernelInfoPtr, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } +} } // namespace detail @@ -3612,12 +3663,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); - template - friend auto sycl::detail::submit_kernel_direct( - const queue &Queue, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, const detail::code_location &CodeLoc); - template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3702,18 +3747,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - template - event submit_kernel_direct_with_event_impl( - nd_range Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - - template - void submit_kernel_direct_without_event_impl( - nd_range Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3898,53 +3931,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; -namespace detail { - -template -auto submit_kernel_direct(const queue &Queue, PropertiesT Props, - nd_range Range, const KernelType &KernelFunc, - const detail::code_location &CodeLoc) { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using NameT = - typename detail::get_kernel_name_t::name; - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - - std::shared_ptr HostKernel = std::make_shared< - detail::HostKernel>(KernelFunc); - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - - detail::KernelWrapper::wrap(KernelFunc); - - if constexpr (EventNeeded) { - return Queue.submit_kernel_direct_with_event_impl( - Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } else { - Queue.submit_kernel_direct_without_event_impl( - Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } -} - -} // namespace detail - } // namespace _V1 } // namespace sycl diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index a3334b1e92ba4..f1df3b27c5e89 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -319,54 +319,6 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -template -event queue::submit_kernel_direct_with_event_impl( - nd_range Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); -} - -template event queue::submit_kernel_direct_with_event_impl<1>( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -template event queue::submit_kernel_direct_with_event_impl<2>( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -template event queue::submit_kernel_direct_with_event_impl<3>( - nd_range<3> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -template -void queue::submit_kernel_direct_without_event_impl( - nd_range Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_kernel_direct_without_event(Range, HostKernel, DeviceKernelInfo, - CodeLoc, IsTopCodeLoc); -} - -template void queue::submit_kernel_direct_without_event_impl<1>( - nd_range<1> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -template void queue::submit_kernel_direct_without_event_impl<2>( - nd_range<2> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - -template void queue::submit_kernel_direct_without_event_impl<3>( - nd_range<3> Range, std::shared_ptr &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -528,6 +480,62 @@ void queue::ext_oneapi_set_external_event(const event &external_event) { const property_list &queue::getPropList() const { return impl->getPropList(); } +template +event submit_kernel_direct_with_event_impl( + const queue &Queue, nd_range Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); +} + +template event submit_kernel_direct_with_event_impl<1>( + const queue &Queue, nd_range<1> Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + +template event submit_kernel_direct_with_event_impl<2>( + const queue &Queue, nd_range<2> Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + +template event submit_kernel_direct_with_event_impl<3>( + const queue &Queue, nd_range<3> Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + +template +void submit_kernel_direct_without_event_impl( + const queue &Queue, nd_range Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + getSyclObjImpl(Queue)->submit_kernel_direct_without_event( + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); +} + +template void submit_kernel_direct_without_event_impl<1>( + const queue &Queue, nd_range<1> Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + +template void submit_kernel_direct_without_event_impl<2>( + const queue &Queue, nd_range<2> Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + +template void submit_kernel_direct_without_event_impl<3>( + const queue &Queue, nd_range<3> Range, + std::shared_ptr &HostKernel, + detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + } // namespace _V1 } // namespace sycl From 552f44812ef686e0639fb0615ea393d8df125b81 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 11:38:01 +0000 Subject: [PATCH 60/65] Address review comments --- .../oneapi/experimental/enqueue_functions.hpp | 13 ++++++------- sycl/include/sycl/khr/free_function_commands.hpp | 4 ++-- sycl/include/sycl/queue.hpp | 12 ++++++------ sycl/source/detail/queue_impl.hpp | 6 ++++-- sycl/source/queue.cpp | 16 ++++++++-------- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 53a2d707fbaf8..3dc28532b2372 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -109,7 +109,6 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc) { return Q.submit_with_event(Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc); } - } // namespace detail template @@ -261,18 +260,18 @@ template Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // TODO The handler-less path does not support reductions yet. if constexpr (sizeof...(ReductionsT) == 0) { detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, Range, KernelObj); - } else { + } else #endif + { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); }); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } -#endif } template , Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // TODO The handler-less path does not support reductions yet. if constexpr (sizeof...(ReductionsT) == 0) { ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> @@ -301,15 +301,14 @@ void nd_launch(queue Q, launch_config, Properties> Config, detail::submit_kernel_direct( std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), KernelObj); - } else { + } else #endif + { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, std::forward(Reductions)...); }); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } -#endif } template diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index c0d2afc4115ad..136f11d4e71ab 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -5,7 +5,7 @@ namespace sycl { inline namespace _V1 { -#ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS +//#ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS namespace khr { template @@ -538,6 +538,6 @@ inline void event_barrier(const queue &q, const std::vector &events, } } // namespace khr -#endif +//#endif } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 2fd790928a32a..273db4b8356e6 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -64,14 +64,14 @@ auto get_native(const SyclObjectT &Obj) template event __SYCL_EXPORT submit_kernel_direct_with_event_impl( - const queue &Queue, nd_range Range, + const queue &Queue, const nd_range &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl( - const queue &Queue, nd_range Range, + const queue &Queue, const nd_range &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); @@ -159,7 +159,7 @@ class __SYCL_EXPORT SubmissionInfo { template auto submit_kernel_direct( - const queue &Queue, PropertiesT Props, nd_range Range, + const queue &Queue, PropertiesT Props, const nd_range &Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) { // TODO Properties not supported yet @@ -3271,20 +3271,20 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // TODO The handler-less path does not support reductions yet. if constexpr (sizeof...(RestT) == 1) { return detail::submit_kernel_direct( *this, ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); - } else { + } else #endif + { return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } -#endif } /// parallel_for version with a kernel represented as a lambda + nd_range that diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index b637dcccc2cc2..c3d6748695423 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -361,7 +361,8 @@ class queue_impl : public std::enable_shared_from_this { template event submit_kernel_direct_with_event( - nd_range Range, std::shared_ptr &HostKernel, + const nd_range &Range, + std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = @@ -372,7 +373,8 @@ class queue_impl : public std::enable_shared_from_this { template void submit_kernel_direct_without_event( - nd_range Range, std::shared_ptr &HostKernel, + const nd_range &Range, + std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f1df3b27c5e89..5338f595c1d1d 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -482,7 +482,7 @@ const property_list &queue::getPropList() const { return impl->getPropList(); } template event submit_kernel_direct_with_event_impl( - const queue &Queue, nd_range Range, + const queue &Queue, const nd_range &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { @@ -491,26 +491,26 @@ event submit_kernel_direct_with_event_impl( } template event submit_kernel_direct_with_event_impl<1>( - const queue &Queue, nd_range<1> Range, + const queue &Queue, const nd_range<1> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event submit_kernel_direct_with_event_impl<2>( - const queue &Queue, nd_range<2> Range, + const queue &Queue, const nd_range<2> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event submit_kernel_direct_with_event_impl<3>( - const queue &Queue, nd_range<3> Range, + const queue &Queue, const nd_range<3> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void submit_kernel_direct_without_event_impl( - const queue &Queue, nd_range Range, + const queue &Queue, const nd_range &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { @@ -519,19 +519,19 @@ void submit_kernel_direct_without_event_impl( } template void submit_kernel_direct_without_event_impl<1>( - const queue &Queue, nd_range<1> Range, + const queue &Queue, const nd_range<1> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void submit_kernel_direct_without_event_impl<2>( - const queue &Queue, nd_range<2> Range, + const queue &Queue, const nd_range<2> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void submit_kernel_direct_without_event_impl<3>( - const queue &Queue, nd_range<3> Range, + const queue &Queue, const nd_range<3> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); From f5c0d77b2d6da7e1fb53810406ac119bb1440441 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 12:32:51 +0000 Subject: [PATCH 61/65] Update Linux symbols --- sycl/test/abi/sycl_symbols_linux.dump | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 76f9a5e543136..ec2a21ef34424 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,6 +2985,12 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSB_16DeviceKernelInfoERKNSB_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSB_16DeviceKernelInfoERKNSB_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSB_16DeviceKernelInfoERKNSB_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv @@ -3755,12 +3761,6 @@ _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyER _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implILi1EEENS0_5eventENS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS7_16DeviceKernelInfoERKNS7_13code_locationEb -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implILi2EEENS0_5eventENS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS7_16DeviceKernelInfoERKNS7_13code_locationEb -_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implILi3EEENS0_5eventENS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS7_16DeviceKernelInfoERKNS7_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implILi1EEEvNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS6_16DeviceKernelInfoERKNS6_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implILi2EEEvNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS6_16DeviceKernelInfoERKNS6_13code_locationEb -_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implILi3EEEvNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNS6_16DeviceKernelInfoERKNS6_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv From b442d3743f116bb0379b37941390a876a709b91a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 12:50:18 +0000 Subject: [PATCH 62/65] ifdef fix --- sycl/include/sycl/khr/free_function_commands.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 136f11d4e71ab..c0d2afc4115ad 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -5,7 +5,7 @@ namespace sycl { inline namespace _V1 { -//#ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS +#ifdef __DPCPP_ENABLE_UNFINISHED_KHR_EXTENSIONS namespace khr { template @@ -538,6 +538,6 @@ inline void event_barrier(const queue &q, const std::vector &events, } } // namespace khr -//#endif +#endif } // namespace _V1 } // namespace sycl From 5fa8cccbb7f235bf2534bb261cf6b8f735ee7cd4 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 16:12:18 +0000 Subject: [PATCH 63/65] Removed unused function declaration --- sycl/include/sycl/queue.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 273db4b8356e6..24fc71e9dff3a 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -217,14 +217,6 @@ template event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); - -template -void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); - } // namespace detail } // namespace ext::oneapi::experimental From fa6d2f8b3fe0b12e912184e237937bf06ff35d30 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 16:40:52 +0000 Subject: [PATCH 64/65] Export template instantiations --- sycl/source/queue.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 5338f595c1d1d..ea64c80668958 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -490,19 +490,19 @@ event submit_kernel_direct_with_event_impl( Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } -template event submit_kernel_direct_with_event_impl<1>( +template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -template event submit_kernel_direct_with_event_impl<2>( +template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -template event submit_kernel_direct_with_event_impl<3>( +template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, @@ -518,19 +518,19 @@ void submit_kernel_direct_without_event_impl( Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } -template void submit_kernel_direct_without_event_impl<1>( +template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -template void submit_kernel_direct_without_event_impl<2>( +template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -template void submit_kernel_direct_without_event_impl<3>( +template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, From f04ed3f2f5d2389d258d44ef9b0577cfeec92213 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 26 Sep 2025 16:47:30 +0000 Subject: [PATCH 65/65] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2c6be6c1684bb..8eed8b8dba437 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -286,6 +286,12 @@ ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z +??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@01@AEBUcode_location@801@_N@Z +??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@01@AEBUcode_location@801@_N@Z +??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@01@AEBUcode_location@801@_N@Z +??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@01@AEBUcode_location@701@_N@Z +??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@01@AEBUcode_location@701@_N@Z +??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@PEAVDeviceKernelInfo@detail@01@AEBUcode_location@701@_N@Z ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z