Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabling reduce_then_scan for "Set" family of scan APIs #1879

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
48 changes: 6 additions & 42 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -1690,32 +1690,11 @@ template <typename _BackendTag, typename _ExecutionPolicy, typename _ForwardIter
_OutputIterator
__pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1,
_ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2,
_OutputIterator __result, _Compare __comp, _IsOpDifference)
_OutputIterator __result, _Compare __comp, _IsOpDifference __is_op_difference)
{
typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _Size1;
typedef typename std::iterator_traits<_ForwardIterator2>::difference_type _Size2;
typedef typename ::std::iterator_traits<_ForwardIterator1>::difference_type _Size1;

const _Size1 __n1 = __last1 - __first1;
const _Size2 __n2 = __last2 - __first2;

//Algo is based on the recommended approach of set_intersection algo for GPU: binary search + scan (copying by mask).
using _ReduceOp = std::plus<_Size1>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<2>;
using _InitType = unseq_backend::__no_init_value<_Size1>;
using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;

_ReduceOp __reduce_op;
_Assigner __assign_op;
_DataAcc __get_data_op;
unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 2>
__copy_by_mask_op;
unseq_backend::__brick_set_op<_ExecutionPolicy, _Compare, _Size1, _Size2, _IsOpDifference> __create_mask_op{
__comp, __n1, __n2};

// temporary buffer to store boolean mask
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n1);

auto __keep1 =
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator1>();
Expand All @@ -1727,25 +1706,10 @@ __pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _F
auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutputIterator>();
auto __buf3 = __keep3(__result, __result + __n1);

auto __result_size =
__par_backend_hetero::__parallel_transform_scan_base(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
__buf1.all_view(), __buf2.all_view(),
oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
__buf3.all_view(), _InitType{},
// local scan
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
_MaskAssigner, decltype(__create_mask_op), _InitType>{
__reduce_op, __get_data_op, __assign_op, _MaskAssigner{}, __create_mask_op},
// scan between groups
unseq_backend::__scan</*inclusive=*/std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign,
_Assigner, _DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op,
__get_data_op},
// global scan
__copy_by_mask_op)
.get();
auto __result_size = __par_backend_hetero::__parallel_set_op(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
__buf1.all_view(), __buf2.all_view(),
__buf3.all_view(), __comp, __is_op_difference)
.get();

return __result + __result_size;
}
Expand Down
193 changes: 183 additions & 10 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -738,12 +738,12 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
}
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType,
typename _LocalScan, typename _GroupScan, typename _GlobalScan>
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType, typename _LocalScan,
typename _GroupScan, typename _GlobalScan>
auto
__parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __in_rng, _Range2&& __out_rng, _InitType __init,
_LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan)
_Range1&& __in_rng, _Range2&& __out_rng, _InitType __init, _LocalScan __local_scan,
_GroupScan __group_scan, _GlobalScan __global_scan)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;

Expand Down Expand Up @@ -798,16 +798,17 @@ struct __simple_write_to_id
}
};

template <typename _Predicate>
template <typename _Predicate, typename _RangeTransform = oneapi::dpl::__internal::__no_op>
struct __gen_mask
{
template <typename _InRng>
bool
operator()(const _InRng& __in_rng, std::size_t __id) const
operator()(_InRng&& __in_rng, std::size_t __id) const
{
return __pred(__in_rng[__id]);
return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]);
}
_Predicate __pred;
_RangeTransform __rng_transform;
};

template <typename _BinaryPredicate>
Expand All @@ -825,6 +826,70 @@ struct __gen_unique_mask
_BinaryPredicate __pred;
};

template <typename _IsOpDifference, typename _Compare>
struct __gen_set_mask
{
template <typename _InRng>
bool
operator()(const _InRng& __in_rng, std::size_t __id) const
{
// First we must extract individual sequences from zip iterator because they may not have the same length,
// dereferencing is dangerous
auto __set_a = std::get<0>(__in_rng.tuple()); // first sequence
auto __set_b = std::get<1>(__in_rng.tuple()); // second sequence
auto __set_mask = std::get<2>(__in_rng.tuple()); // mask sequence

std::size_t __nb = __set_b.size();

auto __val_a = __set_a[__id];

auto __res = oneapi::dpl::__internal::__pstl_lower_bound(__set_b, std::size_t{0}, __nb, __val_a, __comp);

bool bres =
_IsOpDifference::value; //initialization is true in case of difference operation; false - intersection.
if (__res == __nb || __comp(__val_a, __set_b[__res]))
{
// there is no __val_a in __set_b, so __set_b in the difference {__set_a}/{__set_b};
}
else
{
auto __val_b = __set_b[__res];

//Difference operation logic: if number of duplication in __set_a on left side from __id > total number of
//duplication in __set_b then a mask is 1

//Intersection operation logic: if number of duplication in __set_a on left side from __id <= total number of
//duplication in __set_b then a mask is 1

const std::size_t __count_a_left =
__id - oneapi::dpl::__internal::__pstl_left_bound(__set_a, std::size_t{0}, __id, __val_a, __comp) + 1;

const std::size_t __count_b =
oneapi::dpl::__internal::__pstl_right_bound(__set_b, __res, __nb, __val_b, __comp) -
oneapi::dpl::__internal::__pstl_left_bound(__set_b, std::size_t{0}, __res, __val_b, __comp);

if constexpr (_IsOpDifference::value)
bres = __count_a_left > __count_b; /*difference*/
else
bres = __count_a_left <= __count_b; /*intersection*/
}
__set_mask[__id] = bres;
return bres;
}
_Compare __comp;
};

template <std::size_t I>
struct __extract_range_from_zip
{
template <typename _InRng>
auto
operator()(const _InRng& __in_rng) const
{
return std::get<I>(__in_rng.tuple());
}
};

template <typename _GenMask>
struct __gen_count_mask
{
Expand All @@ -837,22 +902,24 @@ struct __gen_count_mask
_GenMask __gen_mask;
};

template <typename _GenMask>
template <typename _GenMask, typename _RangeTransform = oneapi::dpl::__internal::__no_op>
struct __gen_expand_count_mask
{
template <typename _InRng, typename _SizeType>
auto
operator()(_InRng&& __in_rng, _SizeType __id) const
{
auto __transformed_input = __rng_transform(__in_rng);
// Explicitly creating this element type is necessary to avoid modifying the input data when _InRng is a
// zip_iterator which will return a tuple of references when dereferenced. With this explicit type, we copy
// the values of zipped input types rather than their references.
using _ElementType = oneapi::dpl::__internal::__value_t<_InRng>;
_ElementType ele = __in_rng[__id];
using _ElementType = oneapi::dpl::__internal::__value_t<decltype(__transformed_input)>;
_ElementType ele = __transformed_input[__id];
bool mask = __gen_mask(std::forward<_InRng>(__in_rng), __id);
return std::tuple(mask ? _SizeType{1} : _SizeType{0}, mask, ele);
}
_GenMask __gen_mask;
_RangeTransform __rng_transform;
};

struct __get_zeroth_element
Expand Down Expand Up @@ -1197,6 +1264,112 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
}
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
__parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference)
{
// fill in reduce then scan impl
using _GenMaskReduce = oneapi::dpl::__par_backend_hetero::__gen_set_mask<_IsOpDifference, _Compare>;
using _MaskRangeTransform = oneapi::dpl::__par_backend_hetero::__extract_range_from_zip<2>;
using _MaskPredicate = oneapi::dpl::__internal::__no_op;
using _GenMaskScan = oneapi::dpl::__par_backend_hetero::__gen_mask<_MaskPredicate, _MaskRangeTransform>;
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, oneapi::dpl::__internal::__pstl_assign>;
using _Size = oneapi::dpl::__internal::__difference_t<_Range3>;
using _ScanRangeTransform = oneapi::dpl::__par_backend_hetero::__extract_range_from_zip<0>;

using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMaskReduce>;
using _ReduceOp = std::plus<_Size>;
using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMaskScan, _ScanRangeTransform>;
using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element;

oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, std::int32_t> __mask_buf(__exec, __rng1.size());

return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<std::int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
std::forward<_Range3>(__result), _GenReduceInput{_GenMaskReduce{__comp}}, _ReduceOp{},
_GenScanInput{_GenMaskScan{_MaskPredicate{}, _MaskRangeTransform{}}, _ScanRangeTransform{}},
_ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
/*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{});
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
__parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference __is_op_difference)
{
using _Size1 = oneapi::dpl::__internal::__difference_t<_Range1>;
using _Size2 = oneapi::dpl::__internal::__difference_t<_Range2>;

_Size1 __n1 = __rng1.size();
_Size2 __n2 = __rng2.size();

//Algo is based on the recommended approach of set_intersection algo for GPU: binary search + scan (copying by mask).
using _ReduceOp = ::std::plus<_Size1>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<2>;
using _InitType = unseq_backend::__no_init_value<_Size1>;
using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;

_ReduceOp __reduce_op;
_Assigner __assign_op;
_DataAcc __get_data_op;
unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ ::std::true_type, 2>
__copy_by_mask_op;
unseq_backend::__brick_set_op<_ExecutionPolicy, _Compare, _Size1, _Size2, _IsOpDifference> __create_mask_op{
__comp, __n1, __n2};

// temporary buffer to store boolean mask
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n1);

return __par_backend_hetero::__parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
std::forward<_Range3>(__result), _InitType{},
// local scan
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
_MaskAssigner, decltype(__create_mask_op), _InitType>{
__reduce_op, __get_data_op, __assign_op, _MaskAssigner{}, __create_mask_op},
// scan between groups
unseq_backend::__scan</*inclusive=*/std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign, _Assigner,
_DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, __get_data_op},
// global scan
__copy_by_mask_op);
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference __is_op_difference)
{
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
{
return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
std::forward<_Range3>(__result), __comp, __is_op_difference);
}
else
{
return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
__is_op_difference);
}
}

//------------------------------------------------------------------------
// find_or tags
//------------------------------------------------------------------------
Expand Down
13 changes: 7 additions & 6 deletions include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ namespace oneapi::dpl::__par_backend_hetero
template <typename _UnaryOp>
struct __gen_transform_input;

template <typename _Predicate>
template <typename _Predicate, typename _RangeTransform>
struct __gen_mask;

template <typename _BinaryPredicate>
Expand All @@ -245,7 +245,7 @@ struct __gen_unique_mask;
template <typename _GenMask>
struct __gen_count_mask;

template <typename _GenMask>
template <typename _GenMask, typename _RangeTransform>
struct __gen_expand_count_mask;

template <int32_t __offset, typename _Assign>
Expand All @@ -266,8 +266,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen
{
};

template <typename _Predicate>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)>
template <typename _Predicate, typename _RangeTransform>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate,
_RangeTransform)>
: oneapi::dpl::__internal::__are_all_device_copyable<_Predicate>
{
};
Expand All @@ -284,9 +285,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen
{
};

template <typename _GenMask>
template <typename _GenMask, typename _RangeTransform>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask,
_GenMask)>
_GenMask, _RangeTransform)>
: oneapi::dpl::__internal::__are_all_device_copyable<_GenMask>
{
};
Expand Down
2 changes: 1 addition & 1 deletion include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ struct __brick_includes
auto __val_a = __a[__res];

//searching number of duplication
const auto __count_a = __internal::__pstl_right_bound(__a, __res, __a_end, __val_a, __comp) - __res + __res -
const auto __count_a = __internal::__pstl_right_bound(__a, __res, __a_end, __val_a, __comp) -
__internal::__pstl_left_bound(__a, __a_beg, __res, __val_a, __comp);

const auto __count_b = __internal::__pstl_right_bound(__b, _Size2(__idx_b), __b_end, __val_b, __comp) -
Expand Down
Loading