Fix major bug in TeamBitonicSort2. #544
Merged Dec 18, 2019 (6 commits)
24 changes: 12 additions & 12 deletions src/blas/impl/KokkosBlas3_gemm_impl.hpp
@@ -99,11 +99,11 @@ struct impl_deep_copy_matrix_block<TeamHandle,ViewTypeScratch,ViewType,Layout,bl
} else {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,blockDim_j), [&] (const int j) {
#ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_j = offset_j+j;
+ int idx_j = offset_j+j;
Contributor: FYI for other readers: See #543 (comment).
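For readers unfamiliar with KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND: under the workaround the index is declared inside the innermost lambda instead of the outer one, and this PR also drops const from it. A minimal standalone sketch of the pattern, with plain loops standing in for the Kokkos ranges and a shortened macro name (both are assumptions for illustration, not the library's code):

#include <cstdio>

int main() {
  const int offset_j = 4, blockDim_j = 3, blockDim_i = 2;
  auto outer = [&](const int j) {
#ifndef GCC_CXX14_WORKAROUND        // stands in for the KokkosBlas macro
    int idx_j = offset_j + j;       // normal path: computed once per j
#endif
    auto inner = [&](const int i) {
#ifdef GCC_CXX14_WORKAROUND
      int idx_j = offset_j + j;     // workaround: declared in the inner lambda
#endif
      std::printf("i=%d idx_j=%d\n", i, idx_j);
    };
    for (int i = 0; i < blockDim_i; ++i) inner(i);
  };
  for (int j = 0; j < blockDim_j; ++j) outer(j);
  return 0;
}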

#endif
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,blockDim_i), [&] (const int i) {
#ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_j = offset_j+j;
+ int idx_j = offset_j+j;
#endif
const int idx_i = offset_i+i;
A_scr(i,j) = idx_i<A.extent_int(0) && idx_j<A.extent_int(1) ? A(idx_i,idx_j) : ATV::zero();
@@ -131,11 +131,11 @@ struct impl_deep_copy_matrix_block<TeamHandle,ViewTypeScratch,ViewType,Kokkos::L
} else {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,blockDim_i), [&] (const int i) {
#ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_i = offset_i+i;
+ int idx_i = offset_i+i;
#endif
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,blockDim_j), [&] (const int j) {
#ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_i = offset_i+i;
+ int idx_i = offset_i+i;
#endif
const int idx_j = offset_j+j;
A_scr(i,j) = idx_i<A.extent_int(0) && idx_j<A.extent_int(1) ? A(idx_i,idx_j) : ATV::zero();
@@ -168,11 +168,11 @@ struct impl_deep_copy_matrix_block<TeamHandle,ViewTypeScratch,ViewType,Layout,bl
} else {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,blockDim_j), [&] (const int j) {
#ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_j = offset_j+j;
+ int idx_j = offset_j+j;
#endif
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,blockDim_i), [&] (const int i) {
#ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_j = offset_j+j;
+ int idx_j = offset_j+j;
#endif
const int idx_i = offset_i+i;
A_scr(i,j) = idx_i<A.extent_int(1) && idx_j<A.extent_int(0) ? A(idx_j,idx_i) : ATV::zero();
@@ -205,11 +205,11 @@ struct impl_deep_copy_matrix_block<TeamHandle,ViewTypeScratch,ViewType,Kokkos::L
} else {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,blockDim_i), [&] (const int i) {
#ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_i = offset_i+i;
+ int idx_i = offset_i+i;
#endif
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,blockDim_j), [&] (const int j) {
#ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_i = offset_i+i;
+ int idx_i = offset_i+i;
#endif
const int idx_j = offset_j+j;
A_scr(i,j) = idx_i<A.extent_int(1) && idx_j<A.extent_int(0) ? A(idx_j,idx_i) : ATV::zero();
@@ -242,11 +242,11 @@ struct impl_deep_copy_matrix_block<TeamHandle,ViewTypeScratch,ViewType,Layout,bl
} else {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,blockDim_j), [&] (const int j) {
#ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_j = offset_j+j;
+ int idx_j = offset_j+j;
#endif
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,blockDim_i), [&] (const int i) {
#ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_j = offset_j+j;
+ int idx_j = offset_j+j;
#endif
const int idx_i = offset_i+i;
A_scr(i,j) = idx_i<A.extent_int(1) && idx_j<A.extent_int(0) ? ATV::conj(A(idx_j,idx_i)) : ATV::zero();
@@ -279,11 +279,11 @@ struct impl_deep_copy_matrix_block<TeamHandle,ViewTypeScratch,ViewType,Kokkos::L
} else {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,blockDim_i), [&] (const int i) {
#ifndef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_i = offset_i+i;
+ int idx_i = offset_i+i;
#endif
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,blockDim_j), [&] (const int j) {
#ifdef KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND
- const int idx_i = offset_i+i;
+ int idx_i = offset_i+i;
#endif
const int idx_j = offset_j+j;
A_scr(i,j) = idx_i<A.extent_int(1) && idx_j<A.extent_int(0) ? ATV::conj(A(idx_j,idx_i)) : ATV::zero();
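All six hunks touch the same bounds-checked tile copy: a fixed-size block of A is staged into team scratch, and elements past the edge of A are written as zero so the GEMM inner kernel never needs edge cases. A minimal sketch of that idea, with plain arrays standing in for Kokkos Views and scratch (an illustration, not the library's code):

#include <vector>

// Copy a blockDim_i x blockDim_j tile of row-major A (rows x cols),
// starting at (offset_i, offset_j), into scr; zero-pad out-of-bounds.
void copy_tile(const std::vector<double>& A, int rows, int cols,
               int offset_i, int offset_j,
               std::vector<double>& scr, int blockDim_i, int blockDim_j) {
  for (int j = 0; j < blockDim_j; ++j) {
    for (int i = 0; i < blockDim_i; ++i) {
      const int idx_i = offset_i + i;
      const int idx_j = offset_j + j;
      scr[i * blockDim_j + j] =
          (idx_i < rows && idx_j < cols) ? A[idx_i * cols + idx_j] : 0.0;
    }
  }
}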
14 changes: 7 additions & 7 deletions src/common/KokkosKernels_Sorting.hpp
@@ -52,7 +52,7 @@ namespace KokkosKernels {
namespace Impl {

//Radix sort for integers, on a single thread within a team.
- //Pros: few diverging branches, so OK for sorting on a single GPU thread/warp. Better on CPU cores.
+ //Pros: few diverging branches, so OK for sorting on a single GPU vector lane. Better on CPU cores.
//Con: requires auxiliary storage, and this version only works for integers
template<typename Ordinal, typename ValueType>
KOKKOS_INLINE_FUNCTION void
@@ -166,7 +166,7 @@ SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n)

//Radix sort for integers (no internal parallelism).
//While sorting, also permute "perm" array along with the values.
- //Pros: few diverging branches, so good for sorting on a single GPU thread/warp.
+ //Pros: few diverging branches, so good for sorting on a single GPU vector lane.
//Con: requires auxiliary storage, this version only works for integers (although float/double is possible)
template<typename Ordinal, typename ValueType, typename PermType>
KOKKOS_INLINE_FUNCTION void
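A hedged usage sketch for the serial radix sort, based only on the signature visible in the hunk header above (SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n)). The "Con" in the comment shows up in the call: the caller must supply auxiliary storage of the same length for the out-of-place passes; this sketch assumes the sorted result is left in values:

#include "KokkosKernels_Sorting.hpp"

// Sort n integer keys; scratch must hold at least n elements and its
// contents are clobbered. Device-callable, matching the
// KOKKOS_INLINE_FUNCTION marker on the sort itself.
KOKKOS_INLINE_FUNCTION void sort_keys(int* keys, int* scratch, int n) {
  KokkosKernels::Impl::SerialRadixSort(keys, scratch, n);
}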
@@ -395,11 +395,11 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember
if(elem2 < n)
{
//both elements in bounds, so compare them and swap if out of order
- if(comp(values[elem2], values[elem2]))
+ if(comp(values[elem2], values[elem1]))
{
- ValueType temp = values[elem1];
+ ValueType temp1 = values[elem1];
Contributor: This would have been a good place for a Kokkos::swap, as a temporary replacement for std::swap. It could even be that CUDA supports std::swap on device now.
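Following up on that suggestion, a sketch of the kind of helper meant (hypothetical name, not an existing Kokkos API): a swap that is safe to call on device, where std::swap historically was not:

#include <Kokkos_Core.hpp>

// Device-callable swap; ValueType/PermType only need to be copyable.
template <typename T>
KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) {
  T tmp = a;
  a = b;
  b = tmp;
}

// The two swap blocks in this hunk would then collapse to:
//   if(comp(values[elem2], values[elem1])) {
//     kk_swap(values[elem1], values[elem2]);
//     kk_swap(perm[elem1], perm[elem2]);
//   }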

values[elem1] = values[elem2];
- values[elem2] = temp;
+ values[elem2] = temp1;
PermType temp2 = perm[elem1];
perm[elem1] = perm[elem2];
perm[elem2] = temp2;
@@ -414,9 +414,9 @@
{
if(comp(values[elem2], values[elem1]))
{
- ValueType temp = values[elem1];
+ ValueType temp1 = values[elem1];
values[elem1] = values[elem2];
- values[elem2] = temp;
+ values[elem2] = temp1;
PermType temp2 = perm[elem1];
perm[elem1] = perm[elem2];
perm[elem2] = temp2;
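Why the original line was a "major bug": for any strict comparator, comparing an element with itself is always false, so the first (both-in-bounds) branch never swapped anything and left those pairs unsorted. A tiny standalone illustration, with std::less standing in for the sort's comparator:

#include <cassert>
#include <functional>

int main() {
  std::less<int> comp;                  // irreflexive: comp(x, x) is false
  int values[2] = {7, 3};               // elem1 = 0, elem2 = 1
  assert(!comp(values[1], values[1]));  // buggy test: always false, no swap
  assert(comp(values[1], values[0]));   // fixed test: true here, so swap
  return 0;
}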