From 0180f4b5e792af8eb043c6b9a0aa2d864331213f Mon Sep 17 00:00:00 2001
From: Raymond Douglass
Date: Tue, 24 Nov 2020 16:11:50 -0500
Subject: [PATCH 1/3] FIX update-version.sh

---
 ci/release/update-version.sh | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 30db4d43e42..0d7f0c64292 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -48,15 +48,4 @@ function sed_runner() {
 }
 
 sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
-# RTD update
-#sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
-#sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
-
-for FILE in conda/environments/*.yml; do
-  sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/ucx-py=${CURRENT_SHORT_TAG}/ucx-py=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/libcumlprims=${CURRENT_SHORT_TAG}/libcumlprims=${NEXT_SHORT_TAG}/g" ${FILE};
-done
+

From e5020dfebb7d5503ade031d7e11dcaef80013f9b Mon Sep 17 00:00:00 2001
From: Raymond Douglass
Date: Tue, 24 Nov 2020 16:13:37 -0500
Subject: [PATCH 2/3] DOC Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ab9956baab..71447e7891e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
 - PR #77: Fixing CUB include for CUDA < 11
 - PR #86: Missing headers for newly moved prims
 - PR #102: Check alignment before binaryOp dispatch
+- PR #104: Fix update-version.sh
 
 # RAFT 0.16.0 (Date TBD)
 

From 7c107206f6afb686f4faaf2d282cc2c969e54a30 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher
Date: Thu, 26 Nov 2020 19:48:28 +0100
Subject: [PATCH 3/3] [REVIEW] Allow generic reductions for the map then
 reduce op (#94)

* ENH Allow generic reductions for the map then reduce op

* DOC Update changelog

* Allow different input and output type for MapThenReduce

* Set accumulator and neutral type to OutType

Co-authored-by: Dante Gama Dessavre
---
 CHANGELOG.md                                |   1 +
 cpp/include/raft/linalg/map_then_reduce.cuh |  83 ++++++++++----
 cpp/test/linalg/map_then_reduce.cu          | 114 ++++++++++++++++----
 3 files changed, 156 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71447e7891e..5e39361368f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 - PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects
 - PR #101: MST core solver
 - PR #93: Incorporate Date/Nagi implementation of Hungarian Algorithm
+- PR #94: Allow generic reductions for the map then reduce op
 - RP #95: Cholesky rank one update prim
 
 ## Improvements

diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index 1a6513b915e..f2f198670aa 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -24,20 +24,34 @@
 namespace raft {
 namespace linalg {
 
-template <typename Type, int TPB>
-__device__ void reduce(Type *out, const Type acc) {
-  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+struct sum_tag {};
+
+template <typename InType, typename OutType, int TPB>
+__device__ void reduce(OutType *out, const InType acc, sum_tag) {
+  typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  Type tmp = BlockReduce(temp_storage).Sum(acc);
+  OutType tmp = BlockReduce(temp_storage).Sum(acc);
   if (threadIdx.x == 0) {
     raft::myAtomicAdd(out, tmp);
   }
 }
 
-template <typename Type, typename MapOp, int TPB, typename... Args>
-__global__ void mapThenSumReduceKernel(Type *out, size_t len, MapOp map,
-                                       const Type *in, Args... args) {
-  Type acc = (Type)0;
+template <typename InType, typename OutType, int TPB, typename ReduceLambda>
+__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
+  typedef cub::BlockReduce<InType, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
+  if (threadIdx.x == 0) {
+    raft::myAtomicReduce(out, tmp, op);
+  }
+}
+
+template <typename InType, typename OutType, typename MapOp,
+          typename ReduceLambda, int TPB, typename... Args>
+__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
+                                    MapOp map, ReduceLambda op,
+                                    const InType *in, Args... args) {
+  OutType acc = neutral;
   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
   if (idx < len) {
@@ -46,16 +60,18 @@ __global__ void mapThenSumReduceKernel(Type *out, size_t len, MapOp map,
 
   __syncthreads();
 
-  reduce<Type, TPB>(out, acc);
+  reduce<InType, OutType, TPB>(out, acc, op);
 }
 
-template <typename Type, typename MapOp, int TPB, typename... Args>
-void mapThenSumReduceImpl(Type *out, size_t len, MapOp map, cudaStream_t stream,
-                          const Type *in, Args... args) {
-  CUDA_CHECK(cudaMemsetAsync(out, 0, sizeof(Type), stream));
+template <typename InType, typename OutType, typename MapOp,
+          typename ReduceLambda, int TPB, typename... Args>
+void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
+                       ReduceLambda op, cudaStream_t stream, const InType *in,
+                       Args... args) {
+  raft::update_device(out, &neutral, 1, stream);
   const int nblks = raft::ceildiv(len, (size_t)TPB);
-  mapThenSumReduceKernel<Type, MapOp, TPB, Args...>
-    <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
+  mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
+    <<<nblks, TPB, 0, stream>>>(out, len, neutral, map, op, in, args...);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -73,12 +89,39 @@ void mapThenSumReduceImpl(Type *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename Type, typename MapOp, int TPB = 256, typename... Args>
-void mapThenSumReduce(Type *out, size_t len, MapOp map, cudaStream_t stream,
-                      const Type *in, Args... args) {
-  mapThenSumReduceImpl<Type, MapOp, TPB, Args...>(out, len, map, stream, in,
-                                                  args...);
+template <typename InType, typename MapOp, int TPB = 256, typename... Args,
+          typename OutType = InType>
+void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
+                      const InType *in, Args... args) {
+  mapThenReduceImpl<InType, OutType, MapOp, sum_tag, TPB, Args...>(
+    out, len, (OutType)0, map, sum_tag(), stream, in, args...);
 }
 
+/**
+ * @brief CUDA version of map and then generic reduction operation
+ * @tparam Type data-type upon which the math operation will be performed
+ * @tparam MapOp the device-lambda performing the actual map operation
+ * @tparam ReduceLambda the device-lambda performing the actual reduction
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @tparam Args additional parameters
+ * @param out the output reduced value (assumed to be a device pointer)
+ * @param len number of elements in the input array
+ * @param neutral The neutral element of the reduction operation. For example:
+ *        0 for sum, 1 for multiply, +Inf for Min, -Inf for Max
+ * @param map the device-lambda
+ * @param op the reduction device lambda
+ * @param stream cuda-stream where to launch this kernel
+ * @param in the input array
+ * @param args additional input arrays
+ */
+
+template <typename InType, typename MapOp, typename ReduceLambda,
+          int TPB = 256, typename OutType = InType, typename... Args>
+void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map,
+                   ReduceLambda op, cudaStream_t stream, const InType *in,
+                   Args... args) {
+  mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
+    out, len, neutral, map, op, stream, in, args...);
+}
 };  // end namespace linalg
 };  // end namespace raft

diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index adbb339de2d..6e146fa4bbd 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -16,6 +16,7 @@
 
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
+#include <limits>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/random/rng.cuh>
 #include "../test_utils.h"
@@ -23,21 +24,21 @@
 namespace raft {
 namespace linalg {
 
-template <typename Type, typename MapOp>
-__global__ void naiveMapReduceKernel(Type *out, const Type *in, size_t len,
+template <typename InType, typename OutType, typename MapOp>
+__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len,
                                      MapOp map) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < len) {
-    raft::myAtomicAdd(out, map(in[idx]));
+    raft::myAtomicAdd(out, (OutType)map(in[idx]));
   }
 }
 
-template <typename Type, typename MapOp>
-void naiveMapReduce(Type *out, const Type *in, size_t len, MapOp map,
+template <typename InType, typename OutType, typename MapOp>
+void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map,
                     cudaStream_t stream) {
   static const int TPB = 64;
   int nblks = raft::ceildiv(len, (size_t)TPB);
-  naiveMapReduceKernel<Type, MapOp>
+  naiveMapReduceKernel<InType, OutType, MapOp>
     <<<nblks, TPB, 0, stream>>>(out, in, len, map);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -57,19 +58,19 @@ template <typename T>
 // Or else, we get the following compilation error
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
-template <typename T>
-void mapReduceLaunch(T *out_ref, T *out, const T *in, size_t len,
-                     cudaStream_t stream) {
-  auto op = [] __device__(T in) { return in; };
+template <typename InType, typename OutType>
+void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in,
+                     size_t len, cudaStream_t stream) {
+  auto op = [] __device__(InType in) { return in; };
   naiveMapReduce(out_ref, in, len, op, stream);
   mapThenSumReduce(out, len, op, 0, in);
 }
 
-template <typename T>
-class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<T>> {
+template <typename InType, typename OutType>
+class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
  protected:
   void SetUp() override {
-    params = ::testing::TestWithParam<MapReduceInputs<T>>::GetParam();
+    params = ::testing::TestWithParam<MapReduceInputs<InType>>::GetParam();
     raft::random::Rng r(params.seed);
 
     auto len = params.len;
@@ -78,7 +79,7 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<T>> {
     allocate(in, len);
     allocate(out_ref, len);
     allocate(out, len);
-    r.uniform(in, len, T(-1.0), T(1.0), stream);
+    r.uniform(in, len, InType(-1.0), InType(1.0), stream);
     mapReduceLaunch(out_ref, out, in, len, stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
@@ -90,29 +91,98 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<T>> {
   }
 
  protected:
-  MapReduceInputs<T> params;
-  T *in, *out_ref, *out;
+  MapReduceInputs<InType> params;
+  InType *in;
+  OutType *out_ref, *out;
 };
 
 const std::vector<MapReduceInputs<float>> inputsf = {
   {0.001f, 1024 * 1024, 1234ULL}};
-typedef MapReduceTest<float> MapReduceTestF;
-TEST_P(MapReduceTestF, Result) {
+typedef MapReduceTest<float, float> MapReduceTestFF;
+TEST_P(MapReduceTestFF, Result) {
   ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
                           CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestF,
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF,
+                         ::testing::ValuesIn(inputsf));
+
+typedef MapReduceTest<float, double> MapReduceTestFD;
+TEST_P(MapReduceTestFD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD,
                          ::testing::ValuesIn(inputsf));
 
 const std::vector<MapReduceInputs<double>> inputsd = {
   {0.000001, 1024 * 1024, 1234ULL}};
-typedef MapReduceTest<double> MapReduceTestD;
-TEST_P(MapReduceTestD, Result) {
+typedef MapReduceTest<double, double> MapReduceTestDD;
+TEST_P(MapReduceTestDD, Result) {
   ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
                           CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestD,
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD,
                          ::testing::ValuesIn(inputsd));
 
+template <typename T>
+class MapGenericReduceTest : public ::testing::Test {
+  using InType = typename T::first_type;
+  using OutType = typename T::second_type;
+
+ protected:
+  MapGenericReduceTest()
+    : allocator(handle.get_device_allocator()),
+      input(allocator, handle.get_stream(), n),
+      output(allocator, handle.get_stream(), 1) {
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    handle.set_stream(stream);
+    initInput(input.data(), input.size(), stream);
+  }
+
+  void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
+
+ public:
+  void initInput(InType *input, int n, cudaStream_t stream) {
+    raft::random::Rng r(137);
+    r.uniform(input, n, InType(2), InType(3), stream);
+    InType val = 1;
+    raft::update_device(input + 42, &val, 1, stream);
+    val = 5;
+    raft::update_device(input + 337, &val, 1, stream);
+  }
+
+  void testMin() {
+    auto op = [] __device__(InType in) { return in; };
+    const OutType neutral = std::numeric_limits<OutType>::max();
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream,
+                  input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1,
+                                  raft::Compare<OutType>()));
+  }
+
+  void testMax() {
+    auto op = [] __device__(InType in) { return in; };
+    const OutType neutral = std::numeric_limits<OutType>::min();
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream,
+                  input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1,
+                                  raft::Compare<OutType>()));
+  }
+
+ protected:
+  int n = 1237;
+  raft::handle_t handle;
+  cudaStream_t stream;
+  std::shared_ptr<raft::mr::device::allocator> allocator;
+  raft::mr::device::buffer<InType> input;
+  raft::mr::device::buffer<OutType> output;
+};
+
+using IoTypePair =
+  ::testing::Types<std::pair<float, float>, std::pair<float, double>,
+                   std::pair<double, double>>;
+
+TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair);
+TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); }
+TYPED_TEST(MapGenericReduceTest, max) { this->testMax(); }
 }  // end namespace linalg
 }  // end namespace raft
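
A minimal usage sketch of the mapThenReduce primitive introduced in PATCH 3/3, outside the patches themselves. The function name max_of_squares and the pointers d_out/d_in are illustrative assumptions, not taken from the series; the call shape follows the signature added above, with cub::Max() as the reduction operator and std::numeric_limits<float>::lowest() as its neutral element:

  #include <cub/cub.cuh>
  #include <limits>
  #include <raft/linalg/map_then_reduce.cuh>

  // Reduces the squares of d_in[0..len) to their maximum and writes the
  // single result to the device pointer d_out on the given stream.
  void max_of_squares(float *d_out, const float *d_in, size_t len,
                      cudaStream_t stream) {
    // Extended device lambda: compile with nvcc --expt-extended-lambda.
    auto sq = [] __device__(float x) { return x * x; };
    raft::linalg::mapThenReduce(d_out, len,
                                std::numeric_limits<float>::lowest(), sq,
                                cub::Max(), stream, d_in);
  }

Since OutType defaults to InType, both are deduced as float here; passing a double* output with a double neutral element would exercise the mixed-type path that the MapGenericReduceTest pairs cover.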