Skip to content

Commit

Permalink
Deprecate redundant team-level sort functions (#2306)
Browse files Browse the repository at this point in the history
* Deprecate redundant team-level sort functions

These were moved into Kokkos core a long time ago with a nicer
interface and better testing. Replace our implementations with calls
to the Kokkos functions like Kokkos::Experimental::sort_team.

* Formatting

Signed-off-by: Brian Kelley <bmkelle@sandia.gov>

* Use our own DefaultComparator instead of the Kokkos impl

---------

Signed-off-by: Brian Kelley <bmkelle@sandia.gov>
  • Loading branch information
brian-kelley authored Aug 12, 2024
1 parent eca90cf commit 0a6a112
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 320 deletions.
192 changes: 17 additions & 175 deletions common/src/KokkosKernels_Sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define _KOKKOSKERNELS_SORTING_HPP

#include "Kokkos_Core.hpp"
#include "Kokkos_Sort.hpp"
#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum
#include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space
#include <type_traits>
Expand Down Expand Up @@ -59,30 +60,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value
// Team-level parallel sorting (callable inside any TeamPolicy kernel)
// -------------------------------------------------------------------

// Comparison-based sort that uses the entire team (described by mem) to sort a
// raw array according to the comparator.
//   values: pointer to the first element of the array to sort
//   n:      number of elements in values
//   mem:    team member handle of the calling TeamPolicy kernel
//   comp:   comparator; defaults to a default-constructed
//           Impl::DefaultComparator<ValueType>
template <typename Ordinal, typename ValueType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
const Comparator& comp = Comparator());

// Same as TeamBitonicSort, but also permutes perm[0...n) as it sorts
// values[0...n): every swap applied to values is applied to perm as well.
//   values: pointer to the keys being sorted (compared with comp)
//   perm:   pointer to the companion array that follows the same swaps
//   n:      number of elements in both values and perm
//   mem:    team member handle of the calling TeamPolicy kernel
//   comp:   comparator on ValueType; defaults to a default-constructed
//           Impl::DefaultComparator<ValueType>
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem,
const Comparator& comp = Comparator());

namespace Impl {

// Functor that sorts a view on one team
// Holds a copy of the view handle (v) and of the comparator (comp); its call
// operator sorts all of v using the team it is invoked on.
// NOTE(review): this listing shows diff lines without +/- markers, so two
// operator() bodies appear below. Presumably the first (TeamBitonicSort) is
// the superseded one and only the second (Kokkos::Experimental::sort_team)
// exists in the committed file - confirm against the repository.
template <typename View, typename Ordinal, typename TeamMember, typename Comparator>
struct BitonicSingleTeamFunctor {
BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {}
// Sorts the v.extent(0) elements starting at v.data() with
// KokkosKernels::TeamBitonicSort.
KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const {
KokkosKernels::TeamBitonicSort<Ordinal, typename View::value_type, TeamMember, Comparator>(v.data(), v.extent(0), t,
comp);
};
// Sorts v by passing the view itself to Kokkos::Experimental::sort_team.
KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Kokkos::Experimental::sort_team(t, v, comp); };
View v;
Comparator comp;
};
Expand All @@ -97,8 +81,7 @@ struct BitonicChunkFunctor {
Ordinal chunkStart = chunk * chunkSize;
Ordinal n = chunkSize;
if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart;
KokkosKernels::TeamBitonicSort<Ordinal, typename View::value_type, TeamMember, Comparator>(v.data() + chunkStart, n,
t, comp);
Kokkos::Experimental::sort_team(t, Kokkos::subview(v, Kokkos::make_pair(chunkStart, chunkStart + n)), comp);
};
View v;
Comparator comp;
Expand Down Expand Up @@ -217,10 +200,11 @@ void bitonicSort(View v, const Comparator& comp) {
Ordinal npot = 1;
while (npot < n) npot <<= 1;
// Partition the data equally among fixed number of teams
Ordinal chunkSize = 512;
Ordinal numTeams = npot / chunkSize;
Ordinal chunkSize = 512;
Ordinal numTeamsChunkSort = (n + chunkSize - 1) / chunkSize;
Ordinal numTeams = npot / chunkSize;
// First, sort within teams
Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()),
Kokkos::parallel_for(team_policy(numTeamsChunkSort, Kokkos::AUTO()),
Impl::BitonicChunkFunctor<View, Ordinal, team_member, Comparator>(v, comp, chunkSize));
for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) {
Ordinal boxSize = teamsPerBox * chunkSize;
Expand Down Expand Up @@ -388,165 +372,23 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value
// trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and
// memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs
// Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter
// Team-level bitonic sort of values[0...n), performed in place.
//   values: raw array to sort
//   n:      number of elements in values
//   mem:    team member handle; each comparison step is a TeamVectorRange
//           loop over mem followed by mem.team_barrier()
//   comp:   comparator; for a compared pair elem1 < elem2 the two elements
//           are swapped when comp(values[elem2], values[elem1]) is true
template <typename Ordinal, typename ValueType, typename TeamMember, typename Comparator>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
const Comparator& comp) {
// The algorithm is defined for power-of-two input sizes only.
// If n is not a power-of-two, will implicitly pretend
// that values[i] for i >= n is just the max for ValueType, so it never gets
// swapped (pairs whose upper index elem2 is >= n are simply skipped below).
// npot: smallest power of two that is >= n; levels: number of doublings
// needed to reach npot from 1.
Ordinal npot = 1;
Ordinal levels = 0;
while (npot < n) {
levels++;
npot <<= 1;
}
// Level i consists of steps j = 0..i; step j works on boxes of
// (2 << (i - j)) elements.
for (Ordinal i = 0; i < levels; i++) {
for (Ordinal j = 0; j <= i; j++) {
// npot/2 candidate pairs are visited in parallel, one pair per index t
Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) {
// How big are the brown/pink boxes?
// (presumably the box colors of the diagram referenced for this algorithm)
Ordinal boxSize = Ordinal(2) << (i - j);
// Which box contains this thread?
Ordinal boxID = t >> (i - j); // t * 2 / boxSize;
Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize
// t - boxID * boxSize / 2: position of this pair within its box
Ordinal boxOffset = t - (boxStart >> 1);
// elem1 is the lower index of the pair; elem2 (chosen below) is the upper
Ordinal elem1 = boxStart + boxOffset;
if (j == 0) {
// first phase (brown box): within a block, compare with the
// opposite value in the box
Ordinal elem2 = boxStart + boxSize - 1 - boxOffset;
if (elem2 < n) {
// both elements in bounds, so compare them and swap if out of
// order
if (comp(values[elem2], values[elem1])) {
ValueType temp = values[elem1];
values[elem1] = values[elem2];
values[elem2] = temp;
}
}
} else {
// later phases (pink box): within a block, compare with fixed
// distance (boxSize / 2) apart
Ordinal elem2 = elem1 + boxSize / 2;
if (elem2 < n) {
if (comp(values[elem2], values[elem1])) {
ValueType temp = values[elem1];
values[elem1] = values[elem2];
values[elem2] = temp;
}
}
}
});
// Barrier between steps: the next step reads what this one wrote
mem.team_barrier();
}
}
}

// Sort "values", while applying the same swaps to "perm"
// Team-level bitonic sort of values[0...n), performed in place; perm[0...n)
// is permuted identically.
//   values: raw array of keys to sort (the only array passed to comp)
//   perm:   raw companion array; perm[a] and perm[b] are swapped whenever
//           values[a] and values[b] are swapped
//   n:      number of elements in both arrays
//   mem:    team member handle; each comparison step is a TeamVectorRange
//           loop over mem followed by mem.team_barrier()
//   comp:   comparator; for a compared pair elem1 < elem2 the entries are
//           swapped when comp(values[elem2], values[elem1]) is true
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember, typename Comparator>
KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem,
const Comparator& comp) {
// The algorithm is defined for power-of-two input sizes only.
// If n is not a power-of-two, will implicitly pretend
// that values[i] for i >= n is just the max for ValueType, so it never gets
// swapped (pairs whose upper index elem2 is >= n are simply skipped below).
// npot: smallest power of two that is >= n; levels: number of doublings
// needed to reach npot from 1.
Ordinal npot = 1;
Ordinal levels = 0;
while (npot < n) {
levels++;
npot <<= 1;
}
// Level i consists of steps j = 0..i; step j works on boxes of
// (2 << (i - j)) elements.
for (Ordinal i = 0; i < levels; i++) {
for (Ordinal j = 0; j <= i; j++) {
// npot/2 candidate pairs are visited in parallel, one pair per index t
Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) {
// How big are the brown/pink boxes?
// (presumably the box colors of the diagram referenced for this algorithm)
Ordinal boxSize = Ordinal(2) << (i - j);
// Which box contains this thread?
Ordinal boxID = t >> (i - j); // t * 2 / boxSize;
Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize
// t - boxID * boxSize / 2: position of this pair within its box
Ordinal boxOffset = t - (boxStart >> 1);
// elem1 is the lower index of the pair; elem2 (chosen below) is the upper
Ordinal elem1 = boxStart + boxOffset;
if (j == 0) {
// first phase (brown box): within a block, compare with the
// opposite value in the box
Ordinal elem2 = boxStart + boxSize - 1 - boxOffset;
if (elem2 < n) {
// both elements in bounds, so compare them and swap if out of
// order (the key and its perm entry move together)
if (comp(values[elem2], values[elem1])) {
ValueType temp1 = values[elem1];
values[elem1] = values[elem2];
values[elem2] = temp1;
PermType temp2 = perm[elem1];
perm[elem1] = perm[elem2];
perm[elem2] = temp2;
}
}
} else {
// later phases (pink box): within a block, compare with fixed
// distance (boxSize / 2) apart
Ordinal elem2 = elem1 + boxSize / 2;
if (elem2 < n) {
if (comp(values[elem2], values[elem1])) {
ValueType temp1 = values[elem1];
values[elem1] = values[elem2];
values[elem2] = temp1;
PermType temp2 = perm[elem1];
perm[elem1] = perm[elem2];
perm[elem2] = temp2;
}
}
}
});
// Barrier between steps: the next step reads what this one wrote
mem.team_barrier();
}
}
}

// For backward compatibility: keep the public interface accessible in
// KokkosKernels::Impl::
namespace Impl {

// Deprecated backward-compatibility alias in KokkosKernels::Impl.
// Forwards v and comp, with the same template arguments, to
// KokkosKernels::bitonicSort. The deprecation message names that replacement
// so the compiler diagnostic tells callers what to switch to.
//   v:    view to sort
//   comp: comparator; defaults to a default-constructed
//         Impl::DefaultComparator<typename View::value_type>
template <typename View, typename ExecSpace, typename Ordinal,
          typename Comparator = Impl::DefaultComparator<typename View::value_type>>
[[deprecated("Use KokkosKernels::bitonicSort instead")]] void bitonicSort(View v,
                                                                           const Comparator& comp = Comparator()) {
  KokkosKernels::bitonicSort<View, ExecSpace, Ordinal, Comparator>(v, comp);
}

// Deprecated backward-compatibility alias in KokkosKernels::Impl.
// Forwards all arguments unchanged to KokkosKernels::SerialRadixSort. The
// deprecation message names that replacement so the compiler diagnostic tells
// callers what to switch to.
template <typename Ordinal, typename ValueType>
[[deprecated("Use KokkosKernels::SerialRadixSort instead")]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(
    ValueType* values, ValueType* valuesAux, Ordinal n) {
  KokkosKernels::SerialRadixSort<Ordinal, ValueType>(values, valuesAux, n);
}

// Deprecated backward-compatibility alias in KokkosKernels::Impl.
// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts
// values[0...n]. Forwards all arguments unchanged to
// KokkosKernels::SerialRadixSort2; the deprecation message names that
// replacement so the compiler diagnostic tells callers what to switch to.
template <typename Ordinal, typename ValueType, typename PermType>
[[deprecated("Use KokkosKernels::SerialRadixSort2 instead")]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(
    ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) {
  KokkosKernels::SerialRadixSort2<Ordinal, ValueType, PermType>(values, valuesAux, perm, permAux, n);
}

// Deprecated team-level sort of the raw array values[0...n) using comp.
// NOTE(review): this listing shows diff lines without +/- markers, so two
// signatures and two bodies appear below. Presumably the bare [[deprecated]]
// signature with the KokkosKernels::TeamBitonicSort call is the superseded
// version, and only the signature with the deprecation message plus the
// sort_team body exists in the committed file - confirm against the repository.
template <typename Ordinal, typename ValueType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem,
const Comparator& comp = Comparator()) {
KokkosKernels::TeamBitonicSort<Ordinal, ValueType, TeamMember, Comparator>(values, n, mem, comp);
[[deprecated("Use Kokkos::Experimental::sort_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(
ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) {
// Wrap the raw array in a rank-1 View of length n so it can be handed to
// Kokkos::Experimental::sort_team, which takes a view rather than a pointer.
Kokkos::View<ValueType*, Kokkos::AnonymousSpace> valuesView(values, n);
Kokkos::Experimental::sort_team(mem, valuesView, comp);
}

// Same as TeamBitonicSort, but also permutes perm[0...n) as it sorts
// values[0...n).
// Sort "values", while applying the same swaps to "perm"
// NOTE(review): this listing shows diff lines without +/- markers, so two
// signatures and two bodies appear below. Presumably the bare [[deprecated]]
// signature with the KokkosKernels::TeamBitonicSort2 call is the superseded
// version, and only the signature with the deprecation message plus the
// sort_by_key_team body exists in the committed file - confirm against the
// repository.
template <typename Ordinal, typename ValueType, typename PermType, typename TeamMember,
typename Comparator = Impl::DefaultComparator<ValueType>>
[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n,
const TeamMember mem,
const Comparator& comp = Comparator()) {
KokkosKernels::TeamBitonicSort2<Ordinal, ValueType, PermType, TeamMember, Comparator>(values, perm, n, mem, comp);
[[deprecated("Use Kokkos::Experimental::sort_by_key_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(
ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) {
// Wrap both raw arrays in rank-1 Views of length n; values is passed in the
// key position and perm in the value position of sort_by_key_team.
Kokkos::View<ValueType*, Kokkos::AnonymousSpace> valuesView(values, n);
Kokkos::View<PermType*, Kokkos::AnonymousSpace> permView(perm, n);
Kokkos::Experimental::sort_by_key_team(mem, valuesView, permView, comp);
}
} // namespace Impl

} // namespace KokkosKernels

Expand Down
140 changes: 0 additions & 140 deletions common/unit_test/Test_Common_Sorting.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,125 +248,6 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) {
}
}

// Team functor for the TeamBitonicSort test. The team with league rank r
// sorts the counts(r) elements of `values` that start at offsets(r).
template <typename ValView, typename OrdView>
struct TestTeamBitonicFunctor {
  using Value = typename ValView::value_type;

  TestTeamBitonicFunctor(ValView& valuesIn, OrdView& countsIn, OrdView& offsetsIn)
      : values(valuesIn), counts(countsIn), offsets(offsetsIn) {}

  template <typename TeamMem>
  KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
    // One sub-array per team, selected by the team's league rank
    const int league = t.league_rank();
    KokkosKernels::TeamBitonicSort<int, Value, TeamMem>(values.data() + offsets(league), counts(league), t);
  }

  ValView values;   // data holding all sub-arrays back to back
  OrdView counts;   // counts(r): length of sub-array r
  OrdView offsets;  // offsets(r): start index of sub-array r in values
};

// Team functor for the TeamBitonicSort2 test. The team with league rank r
// sorts the counts(r) keys starting at offsets(r) and lets the matching range
// of `values` follow the same swaps.
template <typename KeyView, typename ValView, typename OrdView>
struct TestTeamBitonic2Functor {
  using Key   = typename KeyView::value_type;
  using Value = typename ValView::value_type;

  TestTeamBitonic2Functor(KeyView& keysIn, ValView& valuesIn, OrdView& countsIn, OrdView& offsetsIn)
      : keys(keysIn), values(valuesIn), counts(countsIn), offsets(offsetsIn) {}

  template <typename TeamMem>
  KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const {
    // One sub-array per team, selected by the team's league rank
    const int league = t.league_rank();
    KokkosKernels::TeamBitonicSort2<int, Key, Value, TeamMem>(keys.data() + offsets(league),
                                                               values.data() + offsets(league), counts(league), t);
  }

  KeyView keys;     // sort keys for all sub-arrays, back to back
  ValView values;   // values permuted together with the keys
  OrdView counts;   // counts(r): length of sub-array r
  OrdView offsets;  // offsets(r): start index of sub-array r
};

// Test driver for the team-level bitonic sort: k teams each sort one
// sub-array of a single randomized array, and the result is compared
// element by element against std::sort applied to the same sub-arrays on host.
//   k:            number of sub-arrays (one team per sub-array)
//   subArraySize: size parameter forwarded to generateRandomOffsets, which
//                 fills counts/offsets and returns the total element count
template <typename Device, typename Scalar>
void testTeamBitonicSort(size_t k, size_t subArraySize) {
  using exec_space = typename Device::execution_space;
  using mem_space  = typename Device::memory_space;
  using OrdView    = Kokkos::View<int*, mem_space>;
  using ValView    = Kokkos::View<Scalar*, mem_space>;

  // Sub-array layout: counts(i) elements starting at offsets(i)
  OrdView counts("Subarray Sizes", k);
  OrdView offsets("Subarray Offsets", k);
  const size_t n = generateRandomOffsets<OrdView, exec_space>(counts, offsets, k, subArraySize);

  // Randomized input, plus a host copy that becomes the reference result
  ValView data("Bitonic sort testing data", n);
  fillRandom(data);
  Kokkos::View<Scalar*, Kokkos::HostSpace> gold("Host sorted", n);
  Kokkos::deep_copy(gold, data);

  // Sort every sub-array on device, one team each
  Kokkos::parallel_for(Kokkos::TeamPolicy<exec_space>(k, Kokkos::AUTO()),
                       TestTeamBitonicFunctor<ValView, OrdView>(data, counts, offsets));

  // Bring the device result and the sub-array layout back to host
  auto dataHost = Kokkos::create_mirror_view(data);
  Kokkos::deep_copy(dataHost, data);
  exec_space().fence();
  auto countsHost  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts);
  auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets);

  // Reference: std::sort each sub-array of the host copy
  for (size_t sub = 0; sub < k; sub++) {
    Scalar* const first = gold.data() + offsetsHost(sub);
    std::sort(first, first + countsHost(sub));
  }

  // Device result must match the reference exactly
  for (size_t i = 0; i < n; i++) {
    ASSERT_EQ(dataHost(i), gold(i));
  }
}

// Test driver for the team-level key/value bitonic sort: k teams each sort
// one sub-array of randomized keys while permuting the matching values.
// Checks that (1) the keys equal std::sort applied per sub-array on host and
// (2) every value still equals kvHash<Key, Value>() of the key it sits next to.
//   k:            number of sub-arrays (one team per sub-array)
//   subArraySize: size parameter forwarded to generateRandomOffsets, which
//                 fills counts/offsets and returns the total element count
template <typename Device, typename Key, typename Value>
void testTeamBitonicSort2(size_t k, size_t subArraySize) {
  using exec_space = typename Device::execution_space;
  using mem_space  = typename Device::memory_space;
  using OrdView    = Kokkos::View<int*, mem_space>;
  using KeyView    = Kokkos::View<Key*, mem_space>;
  using ValView    = Kokkos::View<Value*, mem_space>;

  // Sub-array layout: counts(i) elements starting at offsets(i)
  OrdView counts("Subarray Sizes", k);
  OrdView offsets("Subarray Offsets", k);
  const size_t n = generateRandomOffsets<OrdView, exec_space>(counts, offsets, k, subArraySize);

  // Randomized keys with their values, plus a host copy of the keys that
  // becomes the reference result
  KeyView keys("Bitonic test keys", n);
  ValView data("Bitonic test data", n);
  fillRandom(keys, data);
  Kokkos::View<Key*, Kokkos::HostSpace> gold("Host sorted", n);
  Kokkos::deep_copy(gold, keys);

  // Sort every sub-array on device, one team each
  Kokkos::parallel_for(Kokkos::TeamPolicy<exec_space>(k, Kokkos::AUTO()),
                       TestTeamBitonic2Functor<KeyView, ValView, OrdView>(keys, data, counts, offsets));
  exec_space().fence();
  auto countsHost  = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts);
  auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets);

  // Reference: std::sort each sub-array of the host key copy
  for (size_t sub = 0; sub < k; sub++) {
    Key* const first = gold.data() + offsetsHost(sub);
    std::sort(first, first + countsHost(sub));
  }

  // Device results on host
  auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys);
  auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data);

  // Keys must match the reference exactly (stability of the sort is irrelevant)
  for (size_t i = 0; i < n; i++) {
    ASSERT_EQ(keysHost(i), gold(i));
  }
  // Each value must still be the hash of the key at the same position
  for (size_t i = 0; i < n; i++) {
    auto correctHash = kvHash<Key, Value>()(keysHost(i));
    ASSERT_EQ(dataHost(i), correctHash);
  }
}

template <typename View>
struct CheckSortedFunctor {
CheckSortedFunctor(View& v_) : v(v_) {}
Expand Down Expand Up @@ -480,27 +361,6 @@ TEST_F(TestCategory, common_serial_radix2) {
}
}

TEST_F(TestCategory, common_team_bitonic) {
  // Team-level bitonic sort over 20 contiguous sub-arrays per run.
  // testTeamBitonicSort takes (#arrays, max subarray size); the size bound
  // follows x -> 4x + 1 from 0, i.e. 0, 1, 5, 21, 85, 341, 1365, 5461.
  const size_t numArrays = 20;
  size_t arrayMax        = 0;
  while (arrayMax < 10000) {
    testTeamBitonicSort<TestDevice, char>(numArrays, arrayMax);
    testTeamBitonicSort<TestDevice, int>(numArrays, arrayMax);
    arrayMax = 1 + 4 * arrayMax;
  }
}

TEST_F(TestCategory, common_team_bitonic2) {
  // Team-level key/value bitonic sort over 20 contiguous sub-arrays per run.
  // testTeamBitonicSort2 takes (#arrays, max subarray size); the size bound
  // follows x -> 4x + 1 from 0, i.e. 0, 1, 5, 21, 85, 341, 1365, 5461.
  const size_t numArrays = 20;
  size_t arrayMax        = 0;
  while (arrayMax < 10000) {
    testTeamBitonicSort2<TestDevice, char, int>(numArrays, arrayMax);
    testTeamBitonicSort2<TestDevice, int, double>(numArrays, arrayMax);
    testTeamBitonicSort2<TestDevice, int, Kokkos::complex<double>>(numArrays, arrayMax);
    arrayMax = 1 + 4 * arrayMax;
  }
}

TEST_F(TestCategory, common_device_bitonic) {
// Test device-level bitonic with some larger arrays
testBitonicSort<TestDevice, char>(243743);
Expand Down
Loading

0 comments on commit 0a6a112

Please sign in to comment.