diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index 034bf4baf28..8a2fb4d11c0 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -162,20 +162,20 @@ class thread_block_tile : public sycl::sub_group { __dpct_inline__ unsigned size() const noexcept { return Size; } __dpct_inline__ void sync() const noexcept { this->barrier(); } - #define GKO_BIND_SHFL(ShflOpName, ShflOp) \ template \ __dpct_inline__ ValueType ShflOpName(ValueType var, SelectorType selector) \ const noexcept \ { \ - return this->ShflOp(var, selector); \ + return sycl::ShflOp(static_cast(*this), var, \ + selector); \ } \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") - GKO_BIND_SHFL(shfl, shuffle); - GKO_BIND_SHFL(shfl_xor, shuffle_xor); + GKO_BIND_SHFL(shfl, select_from_group); + GKO_BIND_SHFL(shfl_xor, permute_group_by_xor); // the shfl_up of out-of-range value gives undefined behavior, we // manually set it as the original value such that give the same result as @@ -184,7 +184,8 @@ class thread_block_tile : public sycl::sub_group { __dpct_inline__ ValueType shfl_up(ValueType var, SelectorType selector) const noexcept { - const auto result = this->shuffle_up(var, selector); + const auto result = sycl::shift_group_right( + static_cast(*this), var, selector); return (data_.rank < selector) ? var : result; } @@ -195,7 +196,8 @@ class thread_block_tile : public sycl::sub_group { __dpct_inline__ ValueType shfl_down(ValueType var, SelectorType selector) const noexcept { - const auto result = this->shuffle_down(var, selector); + const auto result = sycl::shift_group_left( + static_cast(*this), var, selector); return (data_.rank + selector >= Size) ? var : result; } diff --git a/dpcpp/preconditioner/batch_jacobi_kernels.hpp b/dpcpp/preconditioner/batch_jacobi_kernels.hpp index 769ebc47a57..dd684350375 100644 --- a/dpcpp/preconditioner/batch_jacobi_kernels.hpp +++ b/dpcpp/preconditioner/batch_jacobi_kernels.hpp @@ -99,15 +99,15 @@ __dpct_inline__ int choose_pivot(const int block_size, sg.barrier(); int my_piv_idx = sg_tid; for (int a = sg_size / 2; a > 0; a /= 2) { - const real_type abs_ele_other = sg.shuffle_down(my_abs_ele, a); - const int piv_idx_other = sg.shuffle_down(my_piv_idx, a); + const real_type abs_ele_other = shift_group_left(sg, my_abs_ele, a); + const int piv_idx_other = shift_group_left(sg, my_piv_idx, a); if (my_abs_ele < abs_ele_other) { my_abs_ele = abs_ele_other; my_piv_idx = piv_idx_other; } } sg.barrier(); - const int ipiv = sg.shuffle(my_piv_idx, 0); + const int ipiv = select_from_group(sg, my_piv_idx, 0); return ipiv; } @@ -129,9 +129,9 @@ __dpct_inline__ void invert_dense_block(const int block_size, perm = k; } const ValueType d = - (sg.shuffle(block_row[k], ipiv) == zero()) + (select_from_group(sg, block_row[k], ipiv) == zero()) ? one() - : sg.shuffle(block_row[k], ipiv); + : select_from_group(sg, block_row[k], ipiv); // scale kth col block_row[k] /= -d; if (sg_tid == ipiv) { @@ -140,7 +140,8 @@ __dpct_inline__ void invert_dense_block(const int block_size, const ValueType row_val = block_row[k]; // rank-1 update for (int col = 0; col < block_size; col++) { - const ValueType col_val = sg.shuffle(block_row[col], ipiv); + const ValueType col_val = + select_from_group(sg, block_row[col], ipiv); block_row[col] += row_val * col_val; } // Computations for the threads of the subwarp having local id >= @@ -221,7 +222,7 @@ __dpct_inline__ void compute_block_jacobi_kernel( // array for (int a = 0; a < block_size; a++) { const int col_inv_transposed_mat = a; - const int col = sg.shuffle(perm, a); // column permutation + const int col = select_from_group(sg, perm, a); // column permutation const int row_inv_transposed_mat = perm; // accumulated row swaps during pivoting const auto val_to_write = block_row[col];