From 0180f4b5e792af8eb043c6b9a0aa2d864331213f Mon Sep 17 00:00:00 2001
From: Raymond Douglass
Date: Tue, 24 Nov 2020 16:11:50 -0500
Subject: [PATCH 1/3] FIX update-version.sh

---
 ci/release/update-version.sh | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 30db4d43e42..0d7f0c64292 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -48,15 +48,4 @@ function sed_runner() {
 }
 
 sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
-# RTD update
-#sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
-#sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
-
-for FILE in conda/environments/*.yml; do
-  sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/ucx-py=${CURRENT_SHORT_TAG}/ucx-py=${NEXT_SHORT_TAG}/g" ${FILE};
-  sed_runner "s/libcumlprims=${CURRENT_SHORT_TAG}/libcumlprims=${NEXT_SHORT_TAG}/g" ${FILE};
-done
+

From e5020dfebb7d5503ade031d7e11dcaef80013f9b Mon Sep 17 00:00:00 2001
From: Raymond Douglass
Date: Tue, 24 Nov 2020 16:13:37 -0500
Subject: [PATCH 2/3] DOC Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ab9956baab..71447e7891e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
 - PR #77: Fixing CUB include for CUDA < 11
 - PR #86: Missing headers for newly moved prims
 - PR #102: Check alignment before binaryOp dispatch
+- PR #104: Fix update-version.sh
 
 # RAFT 0.16.0 (Date TBD)
 

From 7c107206f6afb686f4faaf2d282cc2c969e54a30 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher
Date: Thu, 26 Nov 2020 19:48:28 +0100
Subject: [PATCH 3/3] [REVIEW] Allow generic reductions for the map then
 reduce op (#94)

* ENH Allow generic reductions for the map then reduce op

* DOC Update changelog

* Allow different input and output type for MapThenReduce

* Set accumulator and neutral type to OutType

Co-authored-by: Dante Gama Dessavre
---
 CHANGELOG.md                                |   1 +
 cpp/include/raft/linalg/map_then_reduce.cuh |  83 ++++++++++----
 cpp/test/linalg/map_then_reduce.cu          | 114 ++++++++++++++++----
 3 files changed, 156 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71447e7891e..5e39361368f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 - PR #65: Adding cuml prims that break circular dependency between cuml and cumlprims projects
 - PR #101: MST core solver
 - PR #93: Incorporate Date/Nagi implementation of Hungarian Algorithm
+- PR #94: Allow generic reductions for the map then reduce op
 - RP #95: Cholesky rank one update prim
 
 ## Improvements

diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index 1a6513b915e..f2f198670aa 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -24,20 +24,34 @@
 namespace raft {
 namespace linalg {
 
-template <typename Type, int TPB>
-__device__ void reduce(Type *out, const Type acc) {
-  typedef cub::BlockReduce<Type, TPB> BlockReduce;
+struct sum_tag {};
+
+template <typename InType, typename OutType, int TPB>
+__device__ void reduce(OutType *out, const InType acc, sum_tag) {
+  typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  Type tmp = BlockReduce(temp_storage).Sum(acc);
+  OutType tmp = BlockReduce(temp_storage).Sum(acc);
   if (threadIdx.x == 0) {
     raft::myAtomicAdd(out, tmp);
   }
 }
 
-template <typename Type, typename MapOp, int TPB, typename... Args>
-__global__ void mapThenSumReduceKernel(Type *out, size_t len, MapOp map,
-                                       const Type *in, Args... args) {
-  Type acc = (Type)0;
+template <typename InType, typename OutType, int TPB, typename ReduceLambda>
+__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
+  typedef cub::BlockReduce<InType, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
+  if (threadIdx.x == 0) {
+    raft::myAtomicReduce(out, tmp, op);
+  }
+}
+
+template <typename InType, typename OutType, typename MapOp,
+          typename ReduceLambda, int TPB, typename... Args>
+__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
+                                    MapOp map, ReduceLambda op,
+                                    const InType *in, Args... args) {
+  OutType acc = neutral;
   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
   if (idx < len) {
@@ -46,16 +60,18 @@ __global__ void mapThenSumReduceKernel(Type *out, size_t len, MapOp map,
 
   __syncthreads();
 
-  reduce<Type, TPB>(out, acc);
+  reduce<InType, OutType, TPB>(out, acc, op);
 }
 
-template <typename Type, typename MapOp, int TPB, typename... Args>
-void mapThenSumReduceImpl(Type *out, size_t len, MapOp map, cudaStream_t stream,
-                          const Type *in, Args... args) {
-  CUDA_CHECK(cudaMemsetAsync(out, 0, sizeof(Type), stream));
+template <typename InType, typename OutType, typename MapOp,
+          typename ReduceLambda, int TPB, typename... Args>
+void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
+                       ReduceLambda op, cudaStream_t stream, const InType *in,
+                       Args... args) {
+  raft::update_device(out, &neutral, 1, stream);
   const int nblks = raft::ceildiv(len, (size_t)TPB);
-  mapThenSumReduceKernel<Type, MapOp, TPB, Args...>
-    <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
+  mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
+    <<<nblks, TPB, 0, stream>>>(out, len, neutral, map, op, in, args...);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -73,12 +89,39 @@ void mapThenSumReduceImpl(Type *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename Type, typename MapOp, int TPB = 256, typename... Args>
-void mapThenSumReduce(Type *out, size_t len, MapOp map, cudaStream_t stream,
-                      const Type *in, Args... args) {
-  mapThenSumReduceImpl<Type, MapOp, TPB, Args...>(out, len, map, stream, in,
-                                                  args...);
+template <typename InType, typename MapOp, int TPB = 256, typename... Args,
+          typename OutType = InType>
+void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
+                      const InType *in, Args... args) {
+  mapThenReduceImpl<InType, OutType, MapOp, sum_tag, TPB, Args...>(
+    out, len, (OutType)0, map, sum_tag(), stream, in, args...);
 }
 
+/**
+ * @brief CUDA version of map and then generic reduction operation
+ * @tparam Type data-type upon which the math operation will be performed
+ * @tparam MapOp the device-lambda performing the actual map operation
+ * @tparam ReduceLambda the device-lambda performing the actual reduction
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @tparam Args additional parameters
+ * @param out the output reduced value (assumed to be a device pointer)
+ * @param len number of elements in the input array
+ * @param neutral The neutral element of the reduction operation. For example:
+ *        0 for sum, 1 for multiply, +Inf for Min, -Inf for Max
+ * @param map the device-lambda
+ * @param op the reduction device lambda
+ * @param stream cuda-stream where to launch this kernel
+ * @param in the input array
+ * @param args additional input arrays
+ */
+
+template <typename InType, typename MapOp, typename ReduceLambda,
+          int TPB = 256, typename OutType = InType, typename... Args>
+void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map,
+                   ReduceLambda op, cudaStream_t stream, const InType *in,
+                   Args... args) {
+  mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
+    out, len, neutral, map, op, stream, in, args...);
+}
 };  // end namespace linalg
 };  // end namespace raft

diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index adbb339de2d..6e146fa4bbd 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -16,6 +16,7 @@
 
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
+#include <limits>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/random/rng.cuh>
 #include "../test_utils.h"
@@ -23,21 +24,21 @@
 namespace raft {
 namespace linalg {
 
-template <typename Type, typename MapOp>
-__global__ void naiveMapReduceKernel(Type *out, const Type *in, size_t len,
+template <typename InType, typename OutType, typename MapOp>
+__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len,
                                      MapOp map) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < len) {
-    raft::myAtomicAdd(out, map(in[idx]));
+    raft::myAtomicAdd(out, (OutType)map(in[idx]));
   }
 }
 
-template <typename Type, typename MapOp>
-void naiveMapReduce(Type *out, const Type *in, size_t len, MapOp map,
+template <typename InType, typename OutType, typename MapOp>
+void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map,
                     cudaStream_t stream) {
   static const int TPB = 64;
   int nblks = raft::ceildiv(len, (size_t)TPB);
-  naiveMapReduceKernel<Type, MapOp>
+  naiveMapReduceKernel<InType, OutType, MapOp>
     <<<nblks, TPB, 0, stream>>>(out, in, len, map);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -57,19 +58,19 @@ template <typename T>
 // Or else, we get the following compilation error
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
-template <typename T>
-void mapReduceLaunch(T *out_ref, T *out, const T *in, size_t len,
-                     cudaStream_t stream) {
-  auto op = [] __device__(T in) { return in; };
+template <typename InType, typename OutType>
+void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in,
+                     size_t len, cudaStream_t stream) {
+  auto op = [] __device__(InType in) { return in; };
   naiveMapReduce(out_ref, in, len, op, stream);
   mapThenSumReduce(out, len, op, 0, in);
 }
 
-template <typename T>
-class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<T>> {
+template <typename InType, typename OutType>
+class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
  protected:
   void SetUp() override {
-    params = ::testing::TestWithParam<MapReduceInputs<T>>::GetParam();
+    params = ::testing::TestWithParam<MapReduceInputs<InType>>::GetParam();
     raft::random::Rng r(params.seed);
 
     auto len = params.len;
@@ -78,7 +79,7 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<T>> {
     allocate(in, len);
     allocate(out_ref, len);
     allocate(out, len);
-    r.uniform(in, len, T(-1.0), T(1.0), stream);
+    r.uniform(in, len, InType(-1.0), InType(1.0), stream);
     mapReduceLaunch(out_ref, out, in, len, stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
@@ -90,29 +91,98 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<T>> {
   }
 
  protected:
-  MapReduceInputs<T> params;
-  T *in, *out_ref, *out;
+  MapReduceInputs<InType> params;
+  InType *in;
+  OutType *out_ref, *out;
 };
 
 const std::vector<MapReduceInputs<float>> inputsf = {
   {0.001f, 1024 * 1024, 1234ULL}};
-typedef MapReduceTest<float> MapReduceTestF;
-TEST_P(MapReduceTestF, Result) {
+typedef MapReduceTest<float, float> MapReduceTestFF;
+TEST_P(MapReduceTestFF, Result) {
   ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
                           CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestF,
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF,
+                         ::testing::ValuesIn(inputsf));
+
+typedef MapReduceTest<float, double> MapReduceTestFD;
+TEST_P(MapReduceTestFD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD,
                          ::testing::ValuesIn(inputsf));
 
 const std::vector<MapReduceInputs<double>> inputsd = {
   {0.000001, 1024 * 1024, 1234ULL}};
-typedef MapReduceTest<double> MapReduceTestD;
-TEST_P(MapReduceTestD, Result) {
+typedef MapReduceTest<double, double> MapReduceTestDD;
+TEST_P(MapReduceTestDD, Result) {
   ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
                           CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestD,
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD,
                          ::testing::ValuesIn(inputsd));
 
+template <typename T>
+class MapGenericReduceTest : public ::testing::Test {
+  using InType = typename T::first_type;
+  using OutType = typename T::second_type;
+
+ protected:
+  MapGenericReduceTest()
+    : allocator(handle.get_device_allocator()),
+      input(allocator, handle.get_stream(), n),
+      output(allocator, handle.get_stream(), 1) {
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    handle.set_stream(stream);
+    initInput(input.data(), input.size(), stream);
+  }
+
+  void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
+
+ public:
+  void initInput(InType *input, int n, cudaStream_t stream) {
+    raft::random::Rng r(137);
+    r.uniform(input, n, InType(2), InType(3), stream);
+    InType val = 1;
+    raft::update_device(input + 42, &val, 1, stream);
+    val = 5;
+    raft::update_device(input + 337, &val, 1, stream);
+  }
+
+  void testMin() {
+    auto op = [] __device__(InType in) { return in; };
+    const OutType neutral = std::numeric_limits<OutType>::max();
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream,
+                  input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1,
+                                  raft::Compare<OutType>()));
+  }
+
+  void testMax() {
+    auto op = [] __device__(InType in) { return in; };
+    const OutType neutral = std::numeric_limits<OutType>::min();
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream,
+                  input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1,
+                                  raft::Compare<OutType>()));
+  }
+
+ protected:
+  int n = 1237;
+  raft::handle_t handle;
+  cudaStream_t stream;
+  std::shared_ptr<raft::mr::device::allocator> allocator;
+  raft::mr::device::buffer<InType> input;
+  raft::mr::device::buffer<OutType> output;
+};
+
+using IoTypePair =
+  ::testing::Types<std::pair<float, float>, std::pair<float, double>,
+                   std::pair<double, double>>;
+
+TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair);
+TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); }
+TYPED_TEST(MapGenericReduceTest, max) { this->testMax(); }
 }  // end namespace linalg
 }  // end namespace raft
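
A minimal usage sketch of the mapThenReduce primitive introduced in PATCH 3/3, outside the patches themselves. The function name max_of_squares and the pointers d_out/d_in are illustrative assumptions, not taken from the series; the call shape follows the signature added above, with cub::Max() as the reduction operator and std::numeric_limits<float>::lowest() as its neutral element:

  #include <cub/cub.cuh>
  #include <limits>
  #include <raft/linalg/map_then_reduce.cuh>

  // Reduces the squares of d_in[0..len) to their maximum and writes the
  // single result to the device pointer d_out on the given stream.
  void max_of_squares(float *d_out, const float *d_in, size_t len,
                      cudaStream_t stream) {
    // Extended device lambda: compile with nvcc --expt-extended-lambda.
    auto sq = [] __device__(float x) { return x * x; };
    raft::linalg::mapThenReduce(d_out, len,
                                std::numeric_limits<float>::lowest(), sq,
                                cub::Max(), stream, d_in);
  }

Since OutType defaults to InType, both are deduced as float here; passing a double* output with a double neutral element would exercise the mixed-type path that the MapGenericReduceTest pairs cover.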