Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Specify access mode in __result_and_scratch_storage methods #1909

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
// 1. Local scan on each workgroup
auto __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
// TODO what is correct access mode here for __get_scratch_acc call?
Copy link
Contributor Author

@SergeyKopienko SergeyKopienko Oct 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@danhoeflinger do you know the correct template param state here and in other TODO's ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this can be write

// Is default sycl::access_mode::read_write is ok?
auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
Expand All @@ -340,6 +342,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __iters_per_single_wg = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __wgroup_size);
__submit_event = __exec.queue().submit([&](sycl::handler& __cgh) {
__cgh.depends_on(__submit_event);
// TODO what is correct access mode here for __get_scratch_acc call?
// Is default sycl::access_mode::read_write is ok?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think read_write.

auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
Expand All @@ -362,7 +366,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __final_event = __exec.queue().submit([&](sycl::handler& __cgh) {
__cgh.depends_on(__submit_event);
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
// TODO what is correct access mode here for __get_scratch_acc call?
// Is default sycl::access_mode::read_write is ok?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i believe read.

auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
// TODO what is right access mode here?
// sycl::access_mode::read_write is used by default.
Comment on lines +372 to +373
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i believe write

auto __res_acc = __result_and_scratch.__get_result_acc(__cgh);
__cgh.parallel_for<_PropagateScanName...>(sycl::range<1>(__n_groups * __size_per_wg), [=](auto __item) {
auto __temp_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__temp_acc);
Expand Down Expand Up @@ -579,7 +587,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
// predicate on each element of the input range. The second half stores the index of the output
// range to copy elements of the input range.
auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl);
auto __res_acc = __result.__get_result_acc(__hdl);
auto __res_acc = __result.__get_result_acc<sycl::access_mode::write>(__hdl);

__hdl.parallel_for<_ScanKernelName...>(
sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -1466,7 +1474,7 @@ __parallel_find_or_impl_one_wg(oneapi::dpl::__internal::__device_backend_tag, _E
// main parallel_for
auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
auto __result_acc = __result_storage.__get_result_acc(__cgh);
auto __result_acc = __result_storage.__get_result_acc<sycl::access_mode::write>(__cgh);

__cgh.parallel_for<KernelName>(
sycl::nd_range</*dim=*/1>(sycl::range</*dim=*/1>(__wgroup_size), sycl::range</*dim=*/1>(__wgroup_size)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,

sycl::event __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc<sycl::access_mode::write>(__cgh);
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
__cgh.parallel_for<_Name...>(
Expand Down Expand Up @@ -208,7 +208,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __temp_acc = __scratch_container.__get_scratch_acc<sycl::access_mode::write>(__cgh);
__cgh.parallel_for<_KernelName...>(
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
[=](sycl::nd_item<1> __item_id) {
Expand Down Expand Up @@ -253,8 +253,10 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
__reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
__cgh.depends_on(__reduce_event);

// TODO what is correct access mode here for __get_scratch_acc call?
// Is default sycl::access_mode::read_write is ok?
Comment on lines +256 to +257
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is read access.

auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc<sycl::access_mode::write>(__cgh);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__work_group_size), __cgh);

__cgh.parallel_for<_KernelName...>(
Expand Down Expand Up @@ -358,7 +360,11 @@ struct __parallel_transform_reduce_impl
__reduce_event = __exec.queue().submit([&, __is_first, __offset_1, __offset_2, __n,
__n_groups](sycl::handler& __cgh) {
__cgh.depends_on(__reduce_event);
// TODO what is correct access mode here for __get_scratch_acc call?
// Is default sycl::access_mode::read_write is ok?
Comment on lines +363 to +364
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i do think read write is appropriate here.

auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
// TODO what is correct access mode here for __get_result_acc call?
// Is default sycl::access_mode::read_write is ok?
Comment on lines +366 to +367
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is write.

auto __res_acc = __scratch_container.__get_result_acc(__cgh);

// get an access to data under SYCL buffer
Expand All @@ -377,7 +383,8 @@ struct __parallel_transform_reduce_impl
[=](sycl::nd_item<1> __item_id) {
auto __temp_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__temp_acc);
auto __res_ptr =
__result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__res_acc, 2 * __n_groups);
__result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr<sycl::access_mode::write>(
__res_acc, 2 * __n_groups);
Comment on lines +386 to +387
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i dont think we need an access mode here.

auto __local_idx = __item_id.get_local_id(0);
auto __group_idx = __item_id.get_group(0);
// 1. Initialization (transform part). Fill local memory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,10 +298,11 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu
__dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh);
__cgh.depends_on(__prior_event);
oneapi::dpl::__ranges::__require_access(__cgh, __in_rng);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __temp_acc = __scratch_container.__get_scratch_acc<sycl::access_mode::write>(__cgh);
__cgh.parallel_for<_KernelName...>(
__nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
_InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
_InitValueType* __temp_ptr =
_TmpStorageAcc::__get_usm_or_buffer_accessor_ptr<sycl::access_mode::write>(__temp_acc);
Comment on lines +304 to +305
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need an access mode here.

std::size_t __group_id = __ndi.get_group(0);
__dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group();
std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
Expand Down Expand Up @@ -453,8 +454,8 @@ struct __parallel_reduce_then_scan_scan_submitter<
__dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh);
__cgh.depends_on(__prior_event);
oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __temp_acc = __scratch_container.__get_scratch_acc<sycl::access_mode::read_write>(__cgh);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be just read access.

auto __res_acc = __scratch_container.__get_result_acc<sycl::access_mode::write>(__cgh);

__cgh.parallel_for<_KernelName...>(
__nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
Expand Down
39 changes: 29 additions & 10 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -478,12 +478,11 @@ using __repacked_tuple_t = typename __repacked_tuple<T>::type;
template <typename _ContainerOrIterable>
using __value_t = typename __internal::__memobj_traits<_ContainerOrIterable>::value_type;

template <typename _T>
template <typename _T, sycl::access_mode _AccessMode = sycl::access_mode::read_write>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we gain anything from this default template argument. All the usages specify explicitly at this level.

struct __usm_or_buffer_accessor
{
private:
using __accessor_t = sycl::accessor<_T, 1, sycl::access::mode::read_write, __dpl_sycl::__target_device,
sycl::access::placeholder::false_t>;
using __accessor_t = sycl::accessor<_T, 1, _AccessMode, __dpl_sycl::__target_device, sycl::access::placeholder::false_t>;
__accessor_t __acc;
_T* __ptr = nullptr;
bool __usm = false;
Expand All @@ -492,11 +491,12 @@ struct __usm_or_buffer_accessor
public:
// Buffer accessor
__usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf)
: __acc(sycl::accessor(*__sycl_buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{}))
: __acc(sycl::accessor(*__sycl_buf, __cgh, __get_access_mode_tag(), __dpl_sycl::__no_init{}))
{
}
__usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf, size_t __acc_offset)
: __acc(sycl::accessor(*__sycl_buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{})), __offset(__acc_offset)
: __acc(sycl::accessor(*__sycl_buf, __cgh, __get_access_mode_tag(), __dpl_sycl::__no_init{})),
__offset(__acc_offset)
{
}

Expand All @@ -512,6 +512,23 @@ struct __usm_or_buffer_accessor
{
return __usm ? __ptr + __offset : &__acc[__offset];
}

private:

static auto __get_access_mode_tag()
{
if constexpr (_AccessMode == sycl::access::mode::read)
return sycl::read;

else if constexpr (_AccessMode == sycl::access::mode::write)
return sycl::write;

else if constexpr (_AccessMode == sycl::access::mode::read_write)
return sycl::read_write;

else
static_assert(false, "Unknown _AccessMode state");
}
Comment on lines +518 to +531
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a general utility function outside of this structure (with a template arg)? It's useful in the structure but it seems to have general applicability. I'm a little surprised this utility doesn't exist in the specification itself actually.

Also the function itself should be able to be constexpr.

};

template <typename _ExecutionPolicy, typename _T>
Expand Down Expand Up @@ -611,27 +628,29 @@ struct __result_and_scratch_storage
#endif
}

template <sycl::access_mode _AccessMode = sycl::access_mode::read_write>
auto
__get_result_acc(sycl::handler& __cgh) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
if (__use_USM_host && __supports_USM_device)
return __usm_or_buffer_accessor<_T>(__cgh, __result_buf.get());
return __usm_or_buffer_accessor<_T, _AccessMode>(__cgh, __result_buf.get());
else if (__supports_USM_device)
return __usm_or_buffer_accessor<_T>(__cgh, __scratch_buf.get(), __scratch_n);
return __usm_or_buffer_accessor<_T>(__cgh, __sycl_buf.get(), __scratch_n);
return __usm_or_buffer_accessor<_T, _AccessMode>(__cgh, __scratch_buf.get(), __scratch_n);
return __usm_or_buffer_accessor<_T, _AccessMode>(__cgh, __sycl_buf.get(), __scratch_n);
#else
return sycl::accessor(*__sycl_buf.get(), __cgh, sycl::read_write, __dpl_sycl::__no_init{});
#endif
}

template <sycl::access_mode _AccessMode = sycl::access_mode::read_write>
auto
__get_scratch_acc(sycl::handler& __cgh) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
if (__use_USM_host || __supports_USM_device)
return __usm_or_buffer_accessor<_T>(__cgh, __scratch_buf.get());
return __usm_or_buffer_accessor<_T>(__cgh, __sycl_buf.get());
return __usm_or_buffer_accessor<_T, _AccessMode>(__cgh, __scratch_buf.get());
return __usm_or_buffer_accessor<_T, _AccessMode>(__cgh, __sycl_buf.get());
#else
return sycl::accessor(*__sycl_buf.get(), __cgh, sycl::read_write, __dpl_sycl::__no_init{});
#endif
Expand Down
Loading