From dd48eda838cfdb13a8a751de5dbeb3e44bad3ffa Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Dec 2019 15:48:16 -0700 Subject: [PATCH] Fixed sorting tests call RadixSort in a RangePolicy, since it no longer uses ThreadVectorRange loops internally --- src/common/KokkosKernels_Sorting.hpp | 4 +- test_common/Test_Common_Sorting.hpp | 103 ++++++++++++++------------- 2 files changed, 56 insertions(+), 51 deletions(-) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index edb6b64498..dd4929758d 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -52,7 +52,7 @@ namespace KokkosKernels { namespace Impl { //Radix sort for integers, on a single thread within a team. -//Pros: few diverging branches, so OK for sorting on a single GPU thread/warp. Better on CPU cores. +//Pros: few diverging branches, so OK for sorting on a single GPU vector lane. Better on CPU cores. //Con: requires auxiliary storage, and this version only works for integers template KOKKOS_INLINE_FUNCTION void @@ -166,7 +166,7 @@ SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) //Radix sort for integers (no internal parallelism). //While sorting, also permute "perm" array along with the values. -//Pros: few diverging branches, so good for sorting on a single GPU thread/warp. +//Pros: few diverging branches, so good for sorting on a single GPU vector lane. //Con: requires auxiliary storage, this version only works for integers (although float/double is possible) template KOKKOS_INLINE_FUNCTION void diff --git a/test_common/Test_Common_Sorting.hpp b/test_common/Test_Common_Sorting.hpp index dc9ea16d5a..fbf4e53df4 100644 --- a/test_common/Test_Common_Sorting.hpp +++ b/test_common/Test_Common_Sorting.hpp @@ -60,11 +60,9 @@ //Then prefix-sum into randomOffsets. //This simulates a CRS rowmap or other batched sorting scenario template -size_t generateRandomOffsets(OrdView& randomCounts, OrdView& randomOffsets, size_t n, size_t avg) +size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, size_t n, size_t avg) { srand(54321); - randomCounts = OrdView("Counts", n); - randomOffsets = OrdView("Offsets", n); auto countsHost = Kokkos::create_mirror_view(randomCounts); size_t total = 0; for(size_t i = 0; i < n; i++) @@ -175,14 +173,9 @@ struct TestSerialRadixFunctor TestSerialRadixFunctor(ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) : values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) {} - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, counts.extent(0)), - [=](const int i) - { - KokkosKernels::Impl::SerialRadixSort(&values(offsets(i)), &valuesAux(offsets(i)), counts(i)); - }); + KokkosKernels::Impl::SerialRadixSort(values.data() + offsets(i), valuesAux.data() + offsets(i), counts(i)); } ValView values; ValView valuesAux; @@ -200,14 +193,10 @@ struct TestSerialRadix2Functor TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) {} - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, counts.extent(0)), - [=](const int i) - { - KokkosKernels::Impl::SerialRadixSort2(&keys(offsets(i)), &keysAux(offsets(i)), &values(offsets(i)), &valuesAux(offsets(i)), counts(i)); - }); + int off = offsets(i); + KokkosKernels::Impl::SerialRadixSort2(keys.data() + off, keysAux.data() + off, values.data() + off, valuesAux.data() + off, counts(i)); } KeyView keys; KeyView keysAux; @@ -222,16 +211,16 @@ void testSerialRadixSort(size_t k, size_t subArraySize) { //Create a view of randomized data typedef typename ExecSpace::memory_space mem_space; + std::cout << "Exec space is: " << typeid(ExecSpace).name() << '\n'; + std::cout << "Memory space is: " << typeid(mem_space).name() << '\n'; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - auto countsHost = Kokkos::create_mirror_view(counts); - auto offsetsHost = Kokkos::create_mirror_view(offsets); - Kokkos::deep_copy(countsHost, counts); - Kokkos::deep_copy(offsetsHost, offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); KeyView keys("Radix sort testing data", n); fillRandom(keys); //Sort using std::sort on host to do correctness test @@ -244,9 +233,9 @@ void testSerialRadixSort(size_t k, size_t subArraySize) std::sort(begin, end); } KeyView keysAux("Radix sort aux data", n); - //Run the sorting on device in all sub-arrays in parallel, just using vector loops - typedef Kokkos::TeamPolicy team_policy; - Kokkos::parallel_for(team_policy(1, Kokkos::AUTO(), 32), + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; + Kokkos::parallel_for(range_policy(0, k), TestSerialRadixFunctor(keys, keysAux, counts, offsets)); //Copy result to host auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); @@ -264,29 +253,29 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view(Kokkos::HostSpace(), offsets); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); KeyView keys("Radix test keys", n); ValView data("Radix test data", n); //The keys are randomized fillRandom(keys, data); KeyView keysAux("Radix sort aux keys", n); ValView dataAux("Radix sort aux data", n); - //Run the sorting on device in all sub-arrays in parallel, just using vector loops - typedef Kokkos::TeamPolicy team_policy; + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; //Deliberately using a weird number for vector length - Kokkos::parallel_for(team_policy(1, Kokkos::AUTO(), 19), + Kokkos::parallel_for(range_policy(0, k), TestSerialRadix2Functor(keys, keysAux, data, dataAux, counts, offsets)); //Sort using std::sort on host to do correctness test Kokkos::View gold("Host sorted", n); Kokkos::deep_copy(gold, keys); for(size_t i = 0; i < k; i++) { - Key* begin = &gold(offsetsHost(i)); + Key* begin = gold.data() + offsetsHost(i); Key* end = begin + countsHost(i); std::sort(begin, end); } @@ -319,7 +308,7 @@ struct TestTeamBitonicFunctor KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::Impl::TeamBitonicSort(&values(offsets(i)), counts(i), t); + KokkosKernels::Impl::TeamBitonicSort(values.data() + offsets(i), counts(i), t); } ValView values; @@ -341,7 +330,7 @@ struct TestTeamBitonic2Functor KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::Impl::TeamBitonicSort2(&keys(offsets(i)), &values(offsets(i)), counts(i), t); + KokkosKernels::Impl::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); } KeyView keys; @@ -357,14 +346,12 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View ValView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - auto countsHost = Kokkos::create_mirror_view(counts); - auto offsetsHost = Kokkos::create_mirror_view(offsets); - Kokkos::deep_copy(countsHost, counts); - Kokkos::deep_copy(offsetsHost, offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); ValView data("Bitonic sort testing data", n); fillRandom(data); //Run the sorting on device in all sub-arrays in parallel @@ -378,7 +365,7 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) Kokkos::deep_copy(gold, data); for(size_t i = 0; i < k; i++) { - Scalar* begin = &gold(offsetsHost(i)); + Scalar* begin = gold.data() + offsetsHost(i); Scalar* end = begin + countsHost(i); std::sort(begin, end); } @@ -396,14 +383,12 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - Kokkos::deep_copy(countsHost, counts); - Kokkos::deep_copy(offsetsHost, offsets); KeyView keys("Bitonic test keys", n); ValView data("Bitonic test data", n); //The keys are randomized @@ -543,7 +528,7 @@ void testBitonicSortLexicographic() ASSERT_TRUE(ordered); } -TEST_F(TestCategory, serial_radix) { +TEST_F(TestCategory, common_serial_radix) { //Test serial radix over some contiguous small arrays //1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100; @@ -551,13 +536,24 @@ TEST_F(TestCategory, serial_radix) { { testSerialRadixSort(numArrays, arrayMax); testSerialRadixSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_serial_radix2) { + typedef TestExecSpace es; + std::cout << "Test exec space is: " << typeid(es).name() << "\n"; + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { testSerialRadixSort2(numArrays, arrayMax); testSerialRadixSort2(numArrays, arrayMax); testSerialRadixSort2>(numArrays, arrayMax); } } -TEST_F(TestCategory, test_bitonic) { +TEST_F(TestCategory, common_team_bitonic) { //Test team-level bitonic over some contiguous medium arrays //1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 20; @@ -565,13 +561,22 @@ TEST_F(TestCategory, test_bitonic) { { testTeamBitonicSort(numArrays, arrayMax); testTeamBitonicSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_team_bitonic2) { + //Test team-level bitonic over some contiguous medium arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { testTeamBitonicSort2(numArrays, arrayMax); testTeamBitonicSort2(numArrays, arrayMax); testTeamBitonicSort2>(numArrays, arrayMax); } } -TEST_F( TestCategory, device_level_bitonic) { +TEST_F( TestCategory, common_device_bitonic) { //Test device-level bitonic with some larger arrays testBitonicSort(243743); testBitonicSort(2157);