From 446fc0b7cb8ad28c45a7533087b78991fed0d33c Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Dec 2019 10:57:29 -0700 Subject: [PATCH 1/6] Fix major bug in TeamBitonicSort2. Add Test_DEVICE_Common_Sorting.o to unit test executables in Makefile-based build. I think this is why the bug wasn't caught before. --- src/common/KokkosKernels_Sorting.hpp | 10 +- test_common/Test_Common_Sorting.hpp | 278 ++++++++++++++++++++++--- unit_test/Makefile | 4 + unit_test/sparse/Test_Sparse_spadd.hpp | 6 +- 4 files changed, 256 insertions(+), 42 deletions(-) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index 40ad174689..edb6b64498 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -395,11 +395,11 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember if(elem2 < n) { //both elements in bounds, so compare them and swap if out of order - if(comp(values[elem2], values[elem2])) + if(comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; + ValueType temp1 = values[elem1]; values[elem1] = values[elem2]; - values[elem2] = temp; + values[elem2] = temp1; PermType temp2 = perm[elem1]; perm[elem1] = perm[elem2]; perm[elem2] = temp2; @@ -414,9 +414,9 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember { if(comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; + ValueType temp1 = values[elem1]; values[elem1] = values[elem2]; - values[elem2] = temp; + values[elem2] = temp1; PermType temp2 = perm[elem1]; perm[elem1] = perm[elem2]; perm[elem2] = temp2; diff --git a/test_common/Test_Common_Sorting.hpp b/test_common/Test_Common_Sorting.hpp index a066882144..3257eb4380 100644 --- a/test_common/Test_Common_Sorting.hpp +++ b/test_common/Test_Common_Sorting.hpp @@ -53,6 +53,7 @@ #include #include #include +#include #include //Generate n randomized counts with mean . @@ -61,13 +62,17 @@ template size_t generateRandomOffsets(OrdView& randomCounts, OrdView& randomOffsets, size_t n, size_t avg) { + srand(54321); randomCounts = OrdView("Counts", n); randomOffsets = OrdView("Offsets", n); auto countsHost = Kokkos::create_mirror_view(randomCounts); size_t total = 0; for(size_t i = 0; i < n; i++) { - countsHost(i) = 0.5 + rand() % (avg * 2); + if(avg == 0) + countsHost(i) = 0; + else + countsHost(i) = 0.5 + rand() % (avg * 2); total += countsHost(i); } Kokkos::deep_copy(randomCounts, countsHost); @@ -115,6 +120,25 @@ Coordinates getRandom() return Coordinates(getRandom(), getRandom(), getRandom()); } +//Specialize for Kokkos::complex, with the real and imaginary parts different +template +struct kvHash +{ + Value operator()(const Key& k) + { + return (Value) (3 * k + 4); + } +}; + +template +struct kvHash> +{ + Kokkos::complex operator()(const Key& k) + { + return Kokkos::complex(3 * k + 4, k - 10.4); + } +}; + template void fillRandom(View v) { @@ -126,6 +150,23 @@ void fillRandom(View v) Kokkos::deep_copy(v, vhost); } +template +void fillRandom(KeyView k, ValView v) +{ + srand(23456); + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + auto khost = Kokkos::create_mirror_view(k); + auto vhost = Kokkos::create_mirror_view(v); + for(size_t i = 0; i < v.extent(0); i++) + { + khost(i) = getRandom(); + vhost(i) = kvHash()(khost(i)); + } + Kokkos::deep_copy(k, khost); + Kokkos::deep_copy(v, vhost); +} + template struct TestSerialRadixFunctor { @@ -149,46 +190,119 @@ struct TestSerialRadixFunctor OrdView offsets; }; -template -void testSerialRadixSort() +template +struct TestSerialRadix2Functor +{ + //Sort by keys, while permuting values + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + + TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) + {} + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, counts.extent(0)), + [=](const int i) + { + KokkosKernels::Impl::SerialRadixSort2(&keys(offsets(i)), &keysAux(offsets(i)), &values(offsets(i)), &valuesAux(offsets(i)), counts(i)); + }); + } + KeyView keys; + KeyView keysAux; + ValView values; + ValView valuesAux; + OrdView counts; + OrdView offsets; +}; + +template +void testSerialRadixSort(size_t k, size_t subArraySize) { //Create a view of randomized data typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View OrdView; - //typedef Kokkos::View OrdViewHost; - typedef Kokkos::View ValView; + typedef Kokkos::View KeyView; OrdView counts; OrdView offsets; //Generate k sub-array sizes, each with size about 20 - size_t k = 100; - size_t subSize = 20; - size_t n = generateRandomOffsets(counts, offsets, k, subSize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); auto countsHost = Kokkos::create_mirror_view(counts); auto offsetsHost = Kokkos::create_mirror_view(offsets); Kokkos::deep_copy(countsHost, counts); Kokkos::deep_copy(offsetsHost, offsets); - ValView data("Radix sort testing data", n); - fillRandom(data); - ValView dataAux("Radix sort aux data", n); + KeyView keys("Radix sort testing data", n); + fillRandom(keys); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + KeyView keysAux("Radix sort aux data", n); //Run the sorting on device in all sub-arrays in parallel, just using vector loops typedef Kokkos::TeamPolicy team_policy; Kokkos::parallel_for(team_policy(1, Kokkos::AUTO(), 32), - TestSerialRadixFunctor(data, dataAux, counts, offsets)); + TestSerialRadixFunctor(keys, keysAux, counts, offsets)); + //Copy result to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } +} + +template +void testSerialRadixSort2(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + typedef Kokkos::View ValView; + OrdView counts; + OrdView offsets; + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view(Kokkos::HostSpace(), offsets); + KeyView keys("Radix test keys", n); + ValView data("Radix test data", n); + //The keys are randomized + fillRandom(keys, data); + KeyView keysAux("Radix sort aux keys", n); + ValView dataAux("Radix sort aux data", n); + //Run the sorting on device in all sub-arrays in parallel, just using vector loops + typedef Kokkos::TeamPolicy team_policy; + //Deliberately using a weird number for vector length + Kokkos::parallel_for(team_policy(1, Kokkos::AUTO(), 19), + TestSerialRadix2Functor(keys, keysAux, data, dataAux, counts, offsets)); //Sort using std::sort on host to do correctness test - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, data); + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); for(size_t i = 0; i < k; i++) { - Scalar* begin = &gold(offsetsHost(i)); - Scalar* end = begin + countsHost(i); + Key* begin = &gold(offsetsHost(i)); + Key* end = begin + countsHost(i); std::sort(begin, end); } - //Copy result to host - auto dataHost = Kokkos::create_mirror_view(data); - Kokkos::deep_copy(dataHost, data); + //Copy results to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + //Make sure keys are sorted exactly (stability of sort doesn't matter) for(size_t i = 0; i < n; i++) { - ASSERT_EQ(dataHost(i), gold(i)); + ASSERT_EQ(keysHost(i), gold(i)); + } + //Make sure the hashes of each key still matches the corresponding value + for(size_t i = 0; i < n; i++) + { + auto correctHash = kvHash()(keysHost(i)); + ASSERT_EQ(dataHost(i), correctHash); } } @@ -213,20 +327,40 @@ struct TestTeamBitonicFunctor OrdView offsets; }; +template +struct TestTeamBitonic2Functor +{ + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + + TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), values(values_), counts(counts_), offsets(offsets_) + {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + int i = t.league_rank(); + KokkosKernels::Impl::TeamBitonicSort2(&keys(offsets(i)), &values(offsets(i)), counts(i), t); + } + + KeyView keys; + ValView values; + OrdView counts; + OrdView offsets; +}; + template -void testTeamBitonicSort() +void testTeamBitonicSort(size_t k, size_t subArraySize) { //Create a view of randomized data typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View OrdView; - //typedef Kokkos::View OrdViewHost; typedef Kokkos::View ValView; OrdView counts; OrdView offsets; //Generate k sub-array sizes, each with size about 20 - size_t k = 100; - size_t subSize = 100; - size_t n = generateRandomOffsets(counts, offsets, k, subSize); + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); auto countsHost = Kokkos::create_mirror_view(counts); auto offsetsHost = Kokkos::create_mirror_view(offsets); Kokkos::deep_copy(countsHost, counts); @@ -254,6 +388,55 @@ void testTeamBitonicSort() } } +template +void testTeamBitonicSort2(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + typedef Kokkos::View ValView; + OrdView counts; + OrdView offsets; + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + Kokkos::deep_copy(countsHost, counts); + Kokkos::deep_copy(offsetsHost, offsets); + KeyView keys("Bitonic test keys", n); + ValView data("Bitonic test data", n); + //The keys are randomized + fillRandom(keys, data); + //Run the sorting on device in all sub-arrays in parallel, just using vector loops + //Deliberately using a weird number for vector length + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonic2Functor(keys, data, counts, offsets)); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + //Copy results to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + //Make sure keys are sorted exactly (stability of sort doesn't matter) + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } + //Make sure the hashes of each key still matches the corresponding value + for(size_t i = 0; i < n; i++) + { + auto correctHash = kvHash()(keysHost(i)); + ASSERT_EQ(dataHost(i), correctHash); + } +} + template struct CheckSortedFunctor { @@ -360,18 +543,47 @@ void testBitonicSortLexicographic() ASSERT_TRUE(ordered); } -TEST_F( TestCategory, common_Sorting) { - testSerialRadixSort(); - testSerialRadixSort(); - testTeamBitonicSort(); - testTeamBitonicSort(); - testBitonicSort(215673); - testBitonicSort(92314); - testBitonicSort(60234); +TEST_F(TestCategory, serial_radix) { + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { + testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2>(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, test_bitonic) { + //Test team-level bitonic over some contiguous medium arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { + testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2>(numArrays, arrayMax); + } +} + +TEST_F( TestCategory, device_level_bitonic) { + //Test device-level bitonic with some larger arrays + testBitonicSort(243743); + testBitonicSort(2157); testBitonicSort(424); + testBitonicSort(5); + testBitonicSort(92314); testBitonicSort(123); + testBitonicSort(60234); testBitonicSort(53); + //Test custom comparator: ">" instead of "<" to sort descending testBitonicSortDescending(); + //Test custom comparator: lexicographic comparison of 3-element struct testBitonicSortLexicographic(); } diff --git a/unit_test/Makefile b/unit_test/Makefile index a0832edd2b..98c9e2d57c 100644 --- a/unit_test/Makefile +++ b/unit_test/Makefile @@ -131,6 +131,7 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_OPENMP), 1) OBJ_OPENMP += Test_OpenMP_Graph_graph_color_d2.o OBJ_OPENMP += Test_OpenMP_Common_ArithTraits.o OBJ_OPENMP += Test_OpenMP_Common_set_bit_count.o + OBJ_OPENMP += Test_OpenMP_Common_Sorting.o # OBJ_OPENMP += Test_OpenMP_Common_float128.o # Real OBJ_OPENMP += Test_OpenMP_Batched_SerialMatUtil_Real.o @@ -247,6 +248,7 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_CUDA), 1) OBJ_CUDA += Test_Cuda_Graph_graph_color_d2.o OBJ_CUDA += Test_Cuda_Common_ArithTraits.o OBJ_CUDA += Test_Cuda_Common_set_bit_count.o + OBJ_CUDA += Test_Cuda_Common_Sorting.o # Real OBJ_CUDA += Test_Cuda_Batched_SerialMatUtil_Real.o OBJ_CUDA += Test_Cuda_Batched_SerialGemm_Real.o @@ -357,6 +359,7 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_SERIAL), 1) OBJ_SERIAL += Test_Serial_Common_ArithTraits.o OBJ_SERIAL += Test_Serial_Common_set_bit_count.o # OBJ_SERIAL += Test_Serial_Common_float128.o + OBJ_SERIAL += Test_Serial_Common_Sorting.o # Real OBJ_SERIAL += Test_Serial_Batched_SerialMatUtil_Real.o OBJ_SERIAL += Test_Serial_Batched_SerialGemm_Real.o @@ -472,6 +475,7 @@ ifeq ($(KOKKOSKERNELS_INTERNAL_TEST_THREADS), 1) OBJ_THREADS += Test_Threads_Graph_graph_color_d2.o OBJ_THREADS += Test_Threads_Common_ArithTraits.o OBJ_THREADS += Test_Threads_Common_set_bit_count.o + OBJ_THREADS += Test_Threads_Common_Sorting.o # OBJ_THREADS += Test_Threads_Common_float128.o TARGETS += KokkosKernels_UnitTest_Threads TEST_TARGETS += test-threads diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp index 648db5dc1f..9e1d0865f2 100644 --- a/unit_test/sparse/Test_Sparse_spadd.hpp +++ b/unit_test/sparse/Test_Sparse_spadd.hpp @@ -13,10 +13,8 @@ #include //for rand #include //for std::is_same -#ifndef kokkos_complex_double -#define kokkos_complex_double Kokkos::complex -#define kokkos_complex_float Kokkos::complex -#endif +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; namespace Test { From 4a09615258de1139f084e171d5b6d7a3b4e1aac9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Dec 2019 11:31:17 -0700 Subject: [PATCH 2/6] Fixed spadd test --- unit_test/sparse/Test_Sparse_spadd.hpp | 114 ++++++++++++------------- 1 file changed, 53 insertions(+), 61 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp index 9e1d0865f2..04d18a7465 100644 --- a/unit_test/sparse/Test_Sparse_spadd.hpp +++ b/unit_test/sparse/Test_Sparse_spadd.hpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -16,8 +17,6 @@ typedef Kokkos::complex kokkos_complex_double; typedef Kokkos::complex kokkos_complex_float; -namespace Test { - //Create a random square matrix for testing mat-mat addition kernels template crsMat_t randomMatrix(ordinal_type nrows, ordinal_type minNNZ, ordinal_type maxNNZ, bool sortRows) @@ -77,59 +76,6 @@ crsMat_t randomMatrix(ordinal_type nrows, ordinal_type minNNZ, ordinal_type maxN return crsMat_t("test matrix", nrows, nrows, nnz, values, rowmap, entries); } -template -void checkSumRowCorrect(ordinal_type row, crsMat_t A, crsMat_t B, crsMat_t C) -{ - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type size_type_view_t; - typedef typename graph_t::entries_type lno_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename size_type_view_t::non_const_value_type size_type; //rowptr type - typedef typename lno_view_t::non_const_value_type lno_t; //colind type - static_assert(std::is_same::value, "ordinal_type should be same as lno_t from crsMat_t"); - typedef typename scalar_view_t::non_const_value_type scalar_t; //value type - auto Avalues = Kokkos::create_mirror_view(A.values); - auto Arowmap = Kokkos::create_mirror_view(A.graph.row_map); - auto Aentries = Kokkos::create_mirror_view(A.graph.entries); - auto Bvalues = Kokkos::create_mirror_view(B.values); - auto Browmap = Kokkos::create_mirror_view(B.graph.row_map); - auto Bentries = Kokkos::create_mirror_view(B.graph.entries); - auto Cvalues = Kokkos::create_mirror_view(C.values); - auto Crowmap = Kokkos::create_mirror_view(C.graph.row_map); - auto Centries = Kokkos::create_mirror_view(C.graph.entries); - lno_t nrows = Arowmap.extent(0) - 1; - //compute the correct row as a dense vector - std::vector correct(nrows, 0); - std::vector nonzeros(nrows, false); - for(size_type i = Arowmap(row); i < Arowmap(row + 1); i++) - { - correct[Aentries(i)] += Avalues(i); - nonzeros[Aentries(i)] = true; - } - for(size_type i = Browmap(row); i < Browmap(row + 1); i++) - { - correct[Bentries(i)] += Bvalues(i); - nonzeros[Bentries(i)] = true; - } - size_type nz = 0; - for(lno_t i = 0; i < nrows; i++) - { - if(nonzeros[i]) - nz++; - } - //make sure C has the right number of entries - auto actualNZ = Crowmap(row + 1) - Crowmap(row); - ASSERT_EQ(actualNZ, nz) << "A+B row " << row << " has " << actualNZ << " entries but should have " << nz; - //make sure C has the correct values - for(size_type i = Crowmap(row); i < Crowmap(row + 1); i++) - { - scalar_t Cval = Cvalues(i); - lno_t Ccol = Centries(i); - EXPECT_EQ(correct[Ccol], Cval) << "A+B row " << row << ", column " << Ccol << " has value " << Cval << " but should be " << correct[Ccol]; - } -} -} - template void test_spadd(lno_t numRows, size_type minNNZ, size_type maxNNZ, bool sortRows) { @@ -144,8 +90,10 @@ void test_spadd(lno_t numRows, size_type minNNZ, size_type maxNNZ, bool sortRows KernelHandle handle; handle.create_spadd_handle(sortRows); - crsMat_t A = Test::randomMatrix(numRows, minNNZ, maxNNZ, sortRows); - crsMat_t B = Test::randomMatrix(numRows, minNNZ, maxNNZ, sortRows); + crsMat_t A = randomMatrix(numRows, minNNZ, maxNNZ, sortRows); + crsMat_t B = randomMatrix(numRows, minNNZ, maxNNZ, sortRows); + //Matrices from randomMatrix are always square + lno_t numCols = numRows; row_map_type c_row_map("C row map", numRows + 1); auto addHandle = handle.get_spadd_handle(); KokkosSparse::Experimental::spadd_symbolic< @@ -175,11 +123,55 @@ void test_spadd(lno_t numRows, size_type minNNZ, size_type maxNNZ, bool sortRows //create C using CRS arrays crsMat_t C("C", numRows, numRows, addHandle->get_max_result_nnz(), c_values, c_row_map, c_entries); handle.destroy_spadd_handle(); - - //check that C is correct - for(lno_t i = 0; i < numRows; i++) + auto Avalues = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + auto Arowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto Aentries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto Bvalues = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), B.values); + auto Browmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), B.graph.row_map); + auto Bentries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), B.graph.entries); + auto Cvalues = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), C.values); + auto Crowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), C.graph.row_map); + auto Centries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), C.graph.entries); + using KAT = Kokkos::ArithTraits; + auto zero = KAT::zero(); + auto eps = KAT::epsilon(); + //check that C is correct and sorted, row-by-row + for(lno_t row = 0; row < numRows; row++) { - Test::checkSumRowCorrect(i, A, B, C); + std::vector correct(numCols, zero); + std::vector nonzeros(numCols, false); + for(size_type i = Arowmap(row); i < Arowmap(row + 1); i++) + { + correct[Aentries(i)] += Avalues(i); + nonzeros[Aentries(i)] = true; + } + for(size_type i = Browmap(row); i < Browmap(row + 1); i++) + { + correct[Bentries(i)] += Bvalues(i); + nonzeros[Bentries(i)] = true; + } + size_type nz = 0; + for(lno_t i = 0; i < numCols; i++) + { + if(nonzeros[i]) + nz++; + } + //make sure C has the right number of entries + auto actualNZ = Crowmap(row + 1) - Crowmap(row); + ASSERT_EQ(actualNZ, nz) << "A+B row " << row << " has " << actualNZ << " entries but should have " << nz; + //make sure C's indices are sorted + for(size_type i = Crowmap(row) + 1; i < Crowmap(row + 1); i++) + { + ASSERT_LE(Centries(i - 1), Centries(i)) << "C row " << row << " is not sorted"; + } + //make sure C has the correct values + for(size_type i = Crowmap(row); i < Crowmap(row + 1); i++) + { + scalar_t Cval = Cvalues(i); + lno_t Ccol = Centries(i); + //Check that result is correct to 1 ULP + ASSERT_LE(KAT::abs(correct[Ccol] - Cval), KAT::abs(correct[Ccol] * eps)) << "A+B row " << row << ", column " << Ccol << " has value " << Cval << " but should be " << correct[Ccol]; + } } } From c5ea8b701eac41fe681d3f92fae9d7794c7fcb56 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Dec 2019 12:25:42 -0700 Subject: [PATCH 3/6] Use smaller matrices in rank-1 gs tests Pthreads sparse unit test was timing out (1500 seconds) on bowman with double and complex as scalars. --- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 19aa25dfce..8ab16b71e7 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -528,13 +528,13 @@ void test_balloon_clustering(lno_t numRows, size_type nnzPerRow, lno_t bandwidth #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_asymmetric_rank1 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ - test_gauss_seidel_rank1(10000, 10000 * 30, 200, 10, false); \ + test_gauss_seidel_rank1(5000, 5000 * 20, 200, 10, false); \ } \ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_asymmetric_rank2 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_gauss_seidel_rank2(5000, 5000 * 20, 200, 10, 3, false); \ } \ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank1 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ - test_gauss_seidel_rank1(10000, 10000 * 30, 200, 10, true); \ + test_gauss_seidel_rank1(5000, 5000 * 20, 200, 10, true); \ } \ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank2 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_gauss_seidel_rank2(5000, 5000 * 20, 200, 10, 3, true); \ From da08d6fd23d27186eb456c98bfda427580788d86 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Dec 2019 12:41:30 -0700 Subject: [PATCH 4/6] Fix warning, work around GCC 7.4 bug (#543) - sorting tests: Fix "calling host fn from host/device fn" warning - gemm: Work around compiler bug in GCC 7.4 + CUDA --- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 24 ++++++++++++------------ test_common/Test_Common_Sorting.hpp | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index da8a6a6de6..e0d886fe5f 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -99,11 +99,11 @@ struct impl_deep_copy_matrix_block Date: Tue, 17 Dec 2019 15:48:16 -0700 Subject: [PATCH 5/6] Fixed sorting tests call RadixSort in a RangePolicy, since it no longer uses ThreadVectorRange loops internally --- src/common/KokkosKernels_Sorting.hpp | 4 +- test_common/Test_Common_Sorting.hpp | 100 ++++++++++++++------------- 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index edb6b64498..dd4929758d 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -52,7 +52,7 @@ namespace KokkosKernels { namespace Impl { //Radix sort for integers, on a single thread within a team. -//Pros: few diverging branches, so OK for sorting on a single GPU thread/warp. Better on CPU cores. +//Pros: few diverging branches, so OK for sorting on a single GPU vector lane. Better on CPU cores. //Con: requires auxiliary storage, and this version only works for integers template KOKKOS_INLINE_FUNCTION void @@ -166,7 +166,7 @@ SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) //Radix sort for integers (no internal parallelism). //While sorting, also permute "perm" array along with the values. -//Pros: few diverging branches, so good for sorting on a single GPU thread/warp. +//Pros: few diverging branches, so good for sorting on a single GPU vector lane. //Con: requires auxiliary storage, this version only works for integers (although float/double is possible) template KOKKOS_INLINE_FUNCTION void diff --git a/test_common/Test_Common_Sorting.hpp b/test_common/Test_Common_Sorting.hpp index dc9ea16d5a..800e5d482c 100644 --- a/test_common/Test_Common_Sorting.hpp +++ b/test_common/Test_Common_Sorting.hpp @@ -60,11 +60,9 @@ //Then prefix-sum into randomOffsets. //This simulates a CRS rowmap or other batched sorting scenario template -size_t generateRandomOffsets(OrdView& randomCounts, OrdView& randomOffsets, size_t n, size_t avg) +size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, size_t n, size_t avg) { srand(54321); - randomCounts = OrdView("Counts", n); - randomOffsets = OrdView("Offsets", n); auto countsHost = Kokkos::create_mirror_view(randomCounts); size_t total = 0; for(size_t i = 0; i < n; i++) @@ -175,14 +173,9 @@ struct TestSerialRadixFunctor TestSerialRadixFunctor(ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) : values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) {} - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, counts.extent(0)), - [=](const int i) - { - KokkosKernels::Impl::SerialRadixSort(&values(offsets(i)), &valuesAux(offsets(i)), counts(i)); - }); + KokkosKernels::Impl::SerialRadixSort(values.data() + offsets(i), valuesAux.data() + offsets(i), counts(i)); } ValView values; ValView valuesAux; @@ -200,14 +193,10 @@ struct TestSerialRadix2Functor TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) {} - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, counts.extent(0)), - [=](const int i) - { - KokkosKernels::Impl::SerialRadixSort2(&keys(offsets(i)), &keysAux(offsets(i)), &values(offsets(i)), &valuesAux(offsets(i)), counts(i)); - }); + int off = offsets(i); + KokkosKernels::Impl::SerialRadixSort2(keys.data() + off, keysAux.data() + off, values.data() + off, valuesAux.data() + off, counts(i)); } KeyView keys; KeyView keysAux; @@ -224,14 +213,12 @@ void testSerialRadixSort(size_t k, size_t subArraySize) typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - auto countsHost = Kokkos::create_mirror_view(counts); - auto offsetsHost = Kokkos::create_mirror_view(offsets); - Kokkos::deep_copy(countsHost, counts); - Kokkos::deep_copy(offsetsHost, offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); KeyView keys("Radix sort testing data", n); fillRandom(keys); //Sort using std::sort on host to do correctness test @@ -244,9 +231,9 @@ void testSerialRadixSort(size_t k, size_t subArraySize) std::sort(begin, end); } KeyView keysAux("Radix sort aux data", n); - //Run the sorting on device in all sub-arrays in parallel, just using vector loops - typedef Kokkos::TeamPolicy team_policy; - Kokkos::parallel_for(team_policy(1, Kokkos::AUTO(), 32), + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; + Kokkos::parallel_for(range_policy(0, k), TestSerialRadixFunctor(keys, keysAux, counts, offsets)); //Copy result to host auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); @@ -264,29 +251,29 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view(Kokkos::HostSpace(), offsets); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); KeyView keys("Radix test keys", n); ValView data("Radix test data", n); //The keys are randomized fillRandom(keys, data); KeyView keysAux("Radix sort aux keys", n); ValView dataAux("Radix sort aux data", n); - //Run the sorting on device in all sub-arrays in parallel, just using vector loops - typedef Kokkos::TeamPolicy team_policy; + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; //Deliberately using a weird number for vector length - Kokkos::parallel_for(team_policy(1, Kokkos::AUTO(), 19), + Kokkos::parallel_for(range_policy(0, k), TestSerialRadix2Functor(keys, keysAux, data, dataAux, counts, offsets)); //Sort using std::sort on host to do correctness test Kokkos::View gold("Host sorted", n); Kokkos::deep_copy(gold, keys); for(size_t i = 0; i < k; i++) { - Key* begin = &gold(offsetsHost(i)); + Key* begin = gold.data() + offsetsHost(i); Key* end = begin + countsHost(i); std::sort(begin, end); } @@ -319,7 +306,7 @@ struct TestTeamBitonicFunctor KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::Impl::TeamBitonicSort(&values(offsets(i)), counts(i), t); + KokkosKernels::Impl::TeamBitonicSort(values.data() + offsets(i), counts(i), t); } ValView values; @@ -341,7 +328,7 @@ struct TestTeamBitonic2Functor KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::Impl::TeamBitonicSort2(&keys(offsets(i)), &values(offsets(i)), counts(i), t); + KokkosKernels::Impl::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); } KeyView keys; @@ -357,14 +344,12 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) typedef typename ExecSpace::memory_space mem_space; typedef Kokkos::View OrdView; typedef Kokkos::View ValView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - auto countsHost = Kokkos::create_mirror_view(counts); - auto offsetsHost = Kokkos::create_mirror_view(offsets); - Kokkos::deep_copy(countsHost, counts); - Kokkos::deep_copy(offsetsHost, offsets); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); ValView data("Bitonic sort testing data", n); fillRandom(data); //Run the sorting on device in all sub-arrays in parallel @@ -378,7 +363,7 @@ void testTeamBitonicSort(size_t k, size_t subArraySize) Kokkos::deep_copy(gold, data); for(size_t i = 0; i < k; i++) { - Scalar* begin = &gold(offsetsHost(i)); + Scalar* begin = gold.data() + offsetsHost(i); Scalar* end = begin + countsHost(i); std::sort(begin, end); } @@ -396,14 +381,12 @@ void testTeamBitonicSort2(size_t k, size_t subArraySize) typedef Kokkos::View OrdView; typedef Kokkos::View KeyView; typedef Kokkos::View ValView; - OrdView counts; - OrdView offsets; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); //Generate k sub-array sizes, each with size about 20 size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - Kokkos::deep_copy(countsHost, counts); - Kokkos::deep_copy(offsetsHost, offsets); KeyView keys("Bitonic test keys", n); ValView data("Bitonic test data", n); //The keys are randomized @@ -543,7 +526,7 @@ void testBitonicSortLexicographic() ASSERT_TRUE(ordered); } -TEST_F(TestCategory, serial_radix) { +TEST_F(TestCategory, common_serial_radix) { //Test serial radix over some contiguous small arrays //1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100; @@ -551,13 +534,23 @@ TEST_F(TestCategory, serial_radix) { { testSerialRadixSort(numArrays, arrayMax); testSerialRadixSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_serial_radix2) { + typedef TestExecSpace es; + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { testSerialRadixSort2(numArrays, arrayMax); testSerialRadixSort2(numArrays, arrayMax); testSerialRadixSort2>(numArrays, arrayMax); } } -TEST_F(TestCategory, test_bitonic) { +TEST_F(TestCategory, common_team_bitonic) { //Test team-level bitonic over some contiguous medium arrays //1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 20; @@ -565,13 +558,22 @@ TEST_F(TestCategory, test_bitonic) { { testTeamBitonicSort(numArrays, arrayMax); testTeamBitonicSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_team_bitonic2) { + //Test team-level bitonic over some contiguous medium arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { testTeamBitonicSort2(numArrays, arrayMax); testTeamBitonicSort2(numArrays, arrayMax); testTeamBitonicSort2>(numArrays, arrayMax); } } -TEST_F( TestCategory, device_level_bitonic) { +TEST_F( TestCategory, common_device_bitonic) { //Test device-level bitonic with some larger arrays testBitonicSort(243743); testBitonicSort(2157); From 31318de89a0caf7b857f1d15b5802a2561540f48 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Dec 2019 15:56:41 -0700 Subject: [PATCH 6/6] Fixed unused typedef warning/error --- test_common/Test_Common_Sorting.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test_common/Test_Common_Sorting.hpp b/test_common/Test_Common_Sorting.hpp index 800e5d482c..65350870f0 100644 --- a/test_common/Test_Common_Sorting.hpp +++ b/test_common/Test_Common_Sorting.hpp @@ -538,7 +538,6 @@ TEST_F(TestCategory, common_serial_radix) { } TEST_F(TestCategory, common_serial_radix2) { - typedef TestExecSpace es; //Test serial radix over some contiguous small arrays //1st arg is #arrays, 2nd arg is max subarray size size_t numArrays = 100;