From 98aceba9c1354cb007c10f556caffdc8dbc89a7d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Feb 2020 19:31:26 -0800 Subject: [PATCH 01/64] Reapplying PR #17767 --- 3rdparty/mshadow/mshadow/base.h | 48 --- 3rdparty/mshadow/mshadow/half2.h | 162 --------- src/common/cuda_vectorization.cuh | 283 +++++++++++++++ src/operator/mshadow_op.h | 67 ---- .../linalg/broadcast_reduce_customized-inl.h | 4 + src/operator/numpy/np_diff-inl.h | 4 +- src/operator/tensor/broadcast_reduce-inl.cuh | 300 +++++++++++++--- src/operator/tensor/broadcast_reduce-inl.h | 287 +++++++++++----- src/operator/tensor/broadcast_reduce_op.h | 4 +- .../tensor/elemwise_binary_broadcast_op.h | 176 +--------- src/operator/tensor/elemwise_binary_op.cuh | 322 ++++++++++++++++++ src/operator/tensor/elemwise_binary_op.h | 205 ++++++----- .../tensor/elemwise_binary_op_basic.cu | 23 +- .../tensor/elemwise_binary_scalar_op.cuh | 210 ++++++++++++ .../tensor/elemwise_binary_scalar_op.h | 83 +++-- .../tensor/elemwise_binary_scalar_op_basic.cu | 9 +- .../elemwise_binary_scalar_op_extended.cu | 15 +- src/operator/tensor/elemwise_sum.cu | 112 +++++- src/operator/tensor/elemwise_sum.h | 12 - src/operator/tensor/elemwise_unary_op.cuh | 127 +++++++ src/operator/tensor/elemwise_unary_op.h | 56 +-- .../tensor/elemwise_unary_op_basic.cu | 1 + src/operator/tensor/elemwise_unary_op_pow.cu | 1 + src/operator/tensor/elemwise_unary_op_trig.cu | 1 + tests/python/unittest/test_operator.py | 78 +++++ 25 files changed, 1807 insertions(+), 783 deletions(-) delete mode 100644 3rdparty/mshadow/mshadow/half2.h create mode 100644 src/common/cuda_vectorization.cuh create mode 100644 src/operator/tensor/elemwise_binary_op.cuh create mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cuh create mode 100644 src/operator/tensor/elemwise_unary_op.cuh diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index 845ed35cf24f..1a4d5cec52e8 100644 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -272,7 +272,6 @@ extern "C" { } #include "./half.h" -#include "./half2.h" #include "./bfloat.h" #define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \ MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \ @@ -387,11 +386,6 @@ struct DataType { #endif }; template<> -struct DataType { - static const int kFlag = kFloat16; - static const int kLanes = 2; -}; -template<> struct DataType { static const int kFlag = kBfloat16; static const int kLanes = 1; @@ -1144,48 +1138,6 @@ struct minimum { } #endif -#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \ - switch (type) { \ - case mshadow::kFloat32: \ - { \ - typedef float DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kFloat64: \ - { \ - typedef double DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kFloat16: \ - { \ - typedef mshadow::half::half2_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kUint8: \ - { \ - typedef uint8_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kInt32: \ - { \ - typedef int32_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - case mshadow::kInt64: \ - { \ - typedef int64_t DType; \ - {__VA_ARGS__} \ - } \ - break; \ - default: \ - LOG(FATAL) << "Unknown type enum " << type; \ - } - #define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) 
\ switch (type) { \ case mshadow::kFloat32: \ diff --git a/3rdparty/mshadow/mshadow/half2.h b/3rdparty/mshadow/mshadow/half2.h deleted file mode 100644 index cecc5449383c..000000000000 --- a/3rdparty/mshadow/mshadow/half2.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2017 by Contributors - * \file half2.h - * \brief definition of vector float16, half2 type. - * - * \author Antti-Pekka Hynninen - */ -#ifndef MSHADOW_HALF2_H_ -#define MSHADOW_HALF2_H_ - -#if (defined(__CUDACC__) && __CUDA_ARCH__ >= 530 && MSHADOW_USE_CUDA && CUDA_VERSION >= 7050) - #define MSHADOW_CUDA_HALF2 1 - #include -#else - #define MSHADOW_CUDA_HALF2 0 -#endif - -#include - -/*! \brief namespace for mshadow */ -namespace mshadow { -/* \brief name space for host/device portable half-precision floats */ -namespace half { - -#define MSHADOW_HALF2_ASSIGNOP(AOP, OP) \ - template \ - MSHADOW_XINLINE half2_t operator AOP (const T& a) { \ - return *this = half2_t(*this OP a); /* NOLINT(*)*/ \ - } \ - -class MSHADOW_ALIGNED(4) half2_t { - public: -#if MSHADOW_CUDA_HALF2 - half2 half2_; -#else - half_t half_t2[2]; -#endif - - MSHADOW_XINLINE half2_t() {} - -#if MSHADOW_CUDA_HALF2 - MSHADOW_XINLINE explicit half2_t(half2 a) : half2_(a) {} -#else - MSHADOW_XINLINE explicit half2_t(half_t a, half_t b) { - half_t2[0] = a; - half_t2[1] = b; - } -#endif - - MSHADOW_XINLINE explicit half2_t(int a) { -#if MSHADOW_CUDA_HALF2 - half2_ = __half2half2(__int2half_rz(a)); -#else - half_t2[0] = (half_t)a; - half_t2[1] = (half_t)a; -#endif - } - - MSHADOW_XINLINE half2_t operator+() { - return *this; - } - - MSHADOW_XINLINE half2_t operator-() { -#if MSHADOW_CUDA_HALF2 - return half2_t(__hneg2(half2_)); -#else - return half2_t(-half_t2[0], -half_t2[1]); -#endif - } - - MSHADOW_XINLINE half2_t operator=(const half2_t& a) { -#if MSHADOW_CUDA_HALF2 - half2_ = a.half2_; -#else - half_t2[0] = a.half_t2[0]; - half_t2[1] = a.half_t2[1]; -#endif - return a; - } - - MSHADOW_HALF2_ASSIGNOP(+=, +) - MSHADOW_HALF2_ASSIGNOP(-=, -) - MSHADOW_HALF2_ASSIGNOP(*=, *) - MSHADOW_HALF2_ASSIGNOP(/=, /) -}; - -/*! \brief overloaded + operator for half2_t */ -MSHADOW_XINLINE half2_t operator+(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) + __low2float(b.half2_), - __high2float(a.half2_) + __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] + b.half_t2[0], a.half_t2[1] + b.half_t2[1]); -#endif -} -/*! 
\brief overloaded - operator for half2_t */ -MSHADOW_XINLINE half2_t operator-(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) - __low2float(b.half2_), - __high2float(a.half2_) - __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] - b.half_t2[0], a.half_t2[1] - b.half_t2[1]); -#endif -} -/*! \brief overloaded * operator for half2_t */ -MSHADOW_XINLINE half2_t operator*(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) * __low2float(b.half2_), - __high2float(a.half2_) * __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] * b.half_t2[0], a.half_t2[1] * b.half_t2[1]); -#endif -} -/*! \brief overloaded / operator for half2_t */ -MSHADOW_XINLINE half2_t operator/(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(__low2float(a.half2_) / __low2float(b.half2_), - __high2float(a.half2_) / __high2float(b.half2_))); -#else - return half2_t(a.half_t2[0] / b.half_t2[0], a.half_t2[1] / b.half_t2[1]); -#endif -} -/*! \brief overloaded % operator for half2_t */ -MSHADOW_XINLINE half2_t operator%(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return half2_t(__floats2half2_rn(::fmod(__low2float(a.half2_), __low2float(b.half2_)), - ::fmod(__high2float(a.half2_), __high2float(b.half2_)))); -#else - return half2_t(::fmod(a.half_t2[0], b.half_t2[0]), ::fmod(a.half_t2[1], b.half_t2[1])); -#endif -} -/*! \brief overloaded == operator for half2_t */ -MSHADOW_XINLINE bool operator==(half2_t a, half2_t b) { -#if MSHADOW_CUDA_HALF2 - return __hbeq2(a.half2_, b.half2_); -#else - return (a.half_t2[0] == b.half_t2[0] && a.half_t2[1] == b.half_t2[1]); -#endif -} - -} // namespace half -} // namespace mshadow -#endif // MSHADOW_HALF2_H_ diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh new file mode 100644 index 000000000000..7803afb901ab --- /dev/null +++ b/src/common/cuda_vectorization.cuh @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_vectorization.cuh + * \brief GPU helpers for vectorized memory accesses + */ + +#ifndef MXNET_COMMON_CUDA_VECTORIZATION_CUH_ +#define MXNET_COMMON_CUDA_VECTORIZATION_CUH_ + +#if MXNET_USE_CUDA && __CUDACC__ + +#include +#include "cuda_utils.h" + + +namespace mxnet { +namespace common { +namespace cuda { + +/* \brief Helper class that enables storing multiple values of type DType + as 1 value of type LType. 
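+   For example, with DType = mshadow::half::half_t and LType = uint2 the
+   ratio nvec equals 4: the aligned member of the union below holds a single
+   8-byte value, while separate[] exposes the four half-precision lanes
+   stored in the same bytes.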
+*/ +template +class VectorizedStorage { + public: + constexpr static int nvec = sizeof(LType) / sizeof(DType); + union vectorized_storage { + LType aligned; + DType separate[nvec]; // NOLINT(*) + + MSHADOW_XINLINE vectorized_storage() {} + MSHADOW_XINLINE ~vectorized_storage() {} + } scratch_; +}; + +/* \brief Helper class that enables accessing multiple values of type DType + as 1 value of type LType. Additional aligned template argument + allows performance optimizations if the pointer and the size of + the allocation is aligned to sizeof(LType) / sizeof(DType) elements. +*/ +template +class VectorizedAccessor { + public: + using StorageType = VectorizedStorage::type, + typename std::remove_const::type>; + StorageType storage_; + + LType* aligned_ptr_; + DType* unaligned_ptr_; + int alignment_; + index_t n_elems_; + + MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t size) { + unaligned_ptr_ = ptr; + if (aligned) { + alignment_ = 0; + aligned_ptr_ = reinterpret_cast(ptr); + n_elems_ = (size + storage_.nvec - 1) / storage_.nvec; + } else { + size_t ptr_as_number = reinterpret_cast(ptr); + alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); + aligned_ptr_ = reinterpret_cast(ptr - alignment_); + n_elems_ = (size + alignment_ + storage_.nvec - 1) / storage_.nvec; + } + } + + /* \brief Alignment of the input pointer in elements. */ + MSHADOW_XINLINE int alignment() const { + return alignment_; + } + + /* \brief Access to separate elements. */ + MSHADOW_XINLINE DType* separate() { + return storage_.scratch_.separate; + } + + /* \brief Number of elements stored. */ + MSHADOW_XINLINE constexpr int nvec() const { + return storage_.nvec; + } + + /* \brief Number of aligned elements that span the entire input tensor. */ + MSHADOW_XINLINE index_t num_aligned_elements() const { + return n_elems_; + } + + /* \brief Load values from the input. + \param id Aligned index of the element. + \param N size of the tensor. + */ + MSHADOW_XINLINE void load(const index_t id, const index_t N) { + if (aligned) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { + if (id > 0 && id < n_elems_ - 1) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { +#pragma unroll + for (int j = 0; j < storage_.nvec; ++j) { + DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { + storage_.scratch_.separate[j] = *ptr; + } + } + } + } + } +}; + +/* \brief Class used for vectorized read-only access. */ +template +class VectorizedLoader : public VectorizedAccessor { + public: + MSHADOW_XINLINE VectorizedLoader(const DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } +}; + +/* \brief Class used for vectorized writable access. */ +template +class VectorizedStorer : public VectorizedAccessor { + public: + MSHADOW_XINLINE VectorizedStorer(DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } + + /* \brief Store values to the output. + \param id Aligned index of the element. + \param N size of the tensor. 
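+
+     In the kernels that use this class the usual pattern is to call load()
+     first when the request type is kAddTo (so the existing output is
+     accumulated into), update the individual lanes through separate(), and
+     then call store() once per aligned element.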
+ */ + MSHADOW_XINLINE void store(const index_t id, const index_t N) { + if (aligned) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { + if (id > 0 && id < this->n_elems_ - 1) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { +#pragma unroll + for (int j = 0; j < this->storage_.nvec; ++j) { + DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { + *ptr = this->storage_.scratch_.separate[j]; + } + } + } + } + } +}; + +namespace { + +enum class Alignment { + SAME_ALIGNED, // All tensors aligned + SAME_UNALIGNED, // All tensors have the same misalignment + DIFFERENT // Tensors have different alignment +}; + +template +int CalcAlignment(const DType* ptr) { + size_t ptr_as_number = reinterpret_cast(ptr); + return ptr_as_number % sizeof(LType); +} + +/* \brief Check alignment of the inputs and outputs when cast to LType*. + \param params Structuce containing arrays with inputs' and outputs' pointers + \param lead_dim Leading dimension of the tensors. + \param other_dim The size of the other dimensions of the tensors. +*/ +template +Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim) { + int align = -1; + constexpr int nvec = sizeof(LType) / sizeof(DType); + + for (const DType* ptr : params.inputs) { + int new_align = CalcAlignment(ptr); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + } + + for (const DType* ptr : params.outputs) { + int new_align = CalcAlignment(ptr); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + } + + if ((other_dim != 1) && + (lead_dim % nvec != 0)) { + return Alignment::DIFFERENT; + } + + if ((align == 0) && + (lead_dim % nvec == 0)) { + return Alignment::SAME_ALIGNED; + } else { + return Alignment::SAME_UNALIGNED; + } +} + +constexpr int vectorized_kernel_thread_num = 512; + +} // namespace + +/* \brief Helper launcher function for the vectorized kernels. Checks for alignment of the + input and output tensors and launches a proper template. + \param lead_dim Leading dimension of the tensors. + \param other_dim The size of the other dimensions. + \param s Stream which should be used for launching the kernel. + \param params Input parameters to the kernel. Needs to contain at least 2 arrays of DType*: + inputs and outputs, which contain input and output pointers. 
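+
+   As a rough illustration (the pointer names input_ptr and output_ptr are
+   hypothetical, and Kernel stands for any functor following the contract
+   below), a caller fills the kernel's ParamType and hands it to the
+   launcher together with the shape information:
+
+     typename Kernel::ParamType params;
+     params.inputs[0]  = input_ptr;
+     params.outputs[0] = output_ptr;
+     VectorizedKernelLauncher<DType, LType, Kernel>(lead_dim, other_dim, s, params);
+
+   The Kernel type is expected to provide a ParamType and a static, templated
+   Launch(blocks, threads, stream, params, lead_dim, other_dim) method, which
+   this launcher invokes with the alignment case it detects.
+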
+*/ +template +void VectorizedKernelLauncher(const index_t lead_dim, + const index_t other_dim, + mshadow::Stream* s, + typename Kernel::ParamType params) { + static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); + if (lead_dim * other_dim != 0) { + cudaStream_t stream = mshadow::Stream::GetStream(s); + VectorizedLoader l(params.inputs[0], lead_dim); + size_t num_elements = other_dim * l.num_aligned_elements(); + constexpr int threads = vectorized_kernel_thread_num; + constexpr int max_blocks = 65535; + index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), + max_blocks); + auto align = CheckAlignment(params, lead_dim, other_dim); + switch (align) { + case Alignment::SAME_ALIGNED: + Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); + break; + case Alignment::SAME_UNALIGNED: + Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); + break; + case Alignment::DIFFERENT: { + const index_t size = lead_dim * other_dim; + index_t blocks = std::min(static_cast((size + threads - 1) / + threads), + max_blocks); + // If the pointers are aligned differently we cannot vectorize + Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); + break; + } + } + } +} + +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA && __CUDACC__ + +#endif // MXNET_COMMON_CUDA_VECTORIZATION_CUH_ diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 55f26b08fcc1..cb94b69c6bae 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -746,22 +746,8 @@ MXNET_BINARY_MATH_OP_NC(negone, -1); MXNET_BINARY_MATH_OP(div_grad, 1.0f / math::id(b)); -template<> -MSHADOW_XINLINE mshadow::half::half2_t div_grad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return mshadow::half::half2_t(1) / b; -} - MXNET_BINARY_MATH_OP(div_rgrad, -math::id(a) / math::sqr(b)); -template<> -MSHADOW_XINLINE mshadow::half::half2_t div_rgrad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return -a / (b * b); -} - MXNET_BINARY_MATH_OP(rdiv, math::id(b) / math::id(a)); MXNET_BINARY_MATH_OP(rdiv_grad, -math::id(b) / math::sqr(a)); @@ -879,13 +865,6 @@ struct rfmod : public mxnet_op::tunable { } }; -template<> -MSHADOW_XINLINE mshadow::half::half2_t mod::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return a%b; -} - struct mod_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -907,19 +886,6 @@ MSHADOW_XINLINE mshadow::half::half_t mod_grad::Map mshadow::half::half_t b) { return mshadow::half::half_t(1.0f); } -template<> -MSHADOW_XINLINE mshadow::half::half2_t mod_grad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - mshadow::half::half2_t result = mshadow::half::half2_t(); -#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) - result.half2_ = ::__float2half2_rn(1.0f); -#else - result.half_t2[0] = mshadow::half::half_t(0.0f); - result.half_t2[1] = mshadow::half::half_t(1.0f); -#endif - return result; -} struct mod_rgrad : public mxnet_op::tunable { template @@ -942,19 +908,6 @@ MSHADOW_XINLINE mshadow::half::half_t mod_rgrad::Map mshadow::half::half_t b) { return mshadow::half::half_t(-::floorf(static_cast(a/b))); } -template<> -MSHADOW_XINLINE mshadow::half::half2_t mod_rgrad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { -#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) - return 
mshadow::half::half2_t(__hneg2(::h2floor((a/b).half2_))); -#else - return mshadow::half::half2_t(mshadow::half::half_t(-::floorf( - static_cast(a.half_t2[0]/b.half_t2[0]))), - mshadow::half::half_t(-::floorf( - static_cast(a.half_t2[1]/b.half_t2[1])))); -#endif -} struct rmod : public mxnet_op::tunable { template @@ -991,13 +944,6 @@ struct rmod : public mxnet_op::tunable { } }; -template<> -MSHADOW_XINLINE mshadow::half::half2_t rmod::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { - return b%a; -} - struct rmod_grad { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -1019,19 +965,6 @@ MSHADOW_XINLINE mshadow::half::half_t rmod_grad::Map mshadow::half::half_t b) { return mshadow::half::half_t(-::floorf(static_cast(b/a))); } -template<> -MSHADOW_XINLINE mshadow::half::half2_t rmod_grad::Map - (mshadow::half::half2_t a, - mshadow::half::half2_t b) { -#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) - return mshadow::half::half2_t(::__hneg2(::h2floor((b/a).half2_))); -#else - return mshadow::half::half2_t(mshadow::half::half_t(-::floorf( - static_cast(b.half_t2[0]/a.half_t2[0]))), - mshadow::half::half_t(-::floorf( - static_cast(b.half_t2[1]/a.half_t2[1])))); -#endif -} struct clip : public mxnet_op::tunable { template diff --git a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h index 2b5970d4f4ae..0226df45f960 100644 --- a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h +++ b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.h @@ -31,6 +31,10 @@ namespace mxnet { namespace op { namespace broadcast { using namespace mshadow; +using mxnet_op::unravel; +using mxnet_op::ravel; +using mxnet_op::dot; +using mxnet_op::unravel_dot; template MSHADOW_XINLINE void seq_reduce_assign_wr(const index_t idx, const size_t M, const bool addto, diff --git a/src/operator/numpy/np_diff-inl.h b/src/operator/numpy/np_diff-inl.h index 8a8bc558962a..3d80e2d941c8 100644 --- a/src/operator/numpy/np_diff-inl.h +++ b/src/operator/numpy/np_diff-inl.h @@ -73,7 +73,7 @@ struct diff_forward { const int stride, const mshadow::Shape oshape, const mshadow::Shape ishape) { - using namespace broadcast; + using namespace mxnet_op; // j represent the memory index of the corresponding input entry int j = ravel(unravel(i, oshape), ishape); @@ -145,7 +145,7 @@ struct diff_backward { const int stride, const int axis, const mshadow::Shape oshape, const mshadow::Shape ishape) { - using namespace broadcast; + using namespace mxnet_op; if (n == 0) { igrad[i] = ograd[i]; return; diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 379443dc1688..790f1afb857c 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -18,57 +18,253 @@ */ /*! 
- * Copyright (c) 2015-2017 by Contributors + * Copyright (c) 2015-2020 by Contributors * \file broadcast_reduce-inl.cuh * \brief CUDA implementations for binary broadcast and reduce - * \author Antti-Pekka Hynninen + * \author Antti-Pekka Hynninen, Przemyslaw Tredak */ #ifndef MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ #define MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ using namespace mshadow::cuda; -template -__launch_bounds__(kMaxThreadsPerBlock) -__global__ void binary_broadcast_kernel(const int N, const bool addto, - const DType* __restrict lhs, - const DType* __restrict rhs, DType *out, - const Shape lstride, const Shape rstride, - const Shape oshape) { - for (int idx = blockIdx.x * blockDim.x * unroll + threadIdx.x; idx < N; - idx += blockDim.x * gridDim.x * unroll) - { - int j[unroll]; - int k[unroll]; - DType val[unroll]; - #pragma unroll - for (int i=0;i < unroll;i++) { - unravel_dot(idx + i*blockDim.x, oshape, lstride, rstride, &j[i], &k[i]); - val[i] = OP::Map(lhs[j[i]], rhs[k[i]]); +template +struct VectorizedBinaryBroadcastParam { + const DType* inputs[2]; + DType* outputs[1]; + Shape stride[2]; + Shape oshape; + index_t size[2]; +}; + +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +__global__ void VectorizedBinaryBroadcastKernel( + const VectorizedBinaryBroadcastParam param, + const index_t lead_dim, const index_t other_dim, + const index_t num_aligned_elements) { + constexpr int nvec = sizeof(LType) / sizeof(DType); + const index_t M = num_aligned_elements * other_dim; + const index_t N = lead_dim * other_dim; + + VectorizedLoader lloader(param.inputs[0], param.size[0]); + VectorizedLoader rloader(param.inputs[1], param.size[1]); + + for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < M; + idx += gridDim.x * blockDim.x) { + DType * current_output_pointer; + index_t output_size; + index_t output_idx; + if (aligned) { + // Simplified case + index_t lindex, rindex; + unravel_dot(idx * nvec, param.oshape, + param.stride[0], param.stride[1], + &lindex, &rindex); + lloader.load(lindex / nvec, param.size[0]); + rloader.load(rindex / nvec, param.size[1]); + current_output_pointer = param.outputs[0]; + output_size = N; + output_idx = idx; + } else { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + + index_t lindex, rindex; + const index_t original_idx = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; + unravel_dot(original_idx, param.oshape, + param.stride[0], param.stride[1], + &lindex, &rindex); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); + rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); + current_output_pointer = param.outputs[0] + row * lead_dim; + output_size = lead_dim; + output_idx = lead_dim_idx; } - #pragma unroll - for (int i=0;i < unroll;i++) { - if (idx + i*blockDim.x < N) assign(&out[idx + i*blockDim.x], addto, val[i]); + VectorizedStorer storer(current_output_pointer, output_size); + + if (req == kAddTo) { + storer.load(output_idx, output_size); } +#pragma unroll + for (int i = 0; i < lloader.nvec(); ++i) { + DType temp = OP::Map(lloader.separate()[i], + rloader.separate()[i]); + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(output_idx, output_size); } } +template +__global__ void VectorizedBinaryBroadcastSingleSideKernel( + const VectorizedBinaryBroadcastParam param, + const index_t 
lead_dim, const index_t other_dim, + const index_t num_aligned_elements) { + constexpr int nvec = sizeof(LType) / sizeof(DType); + const index_t M = num_aligned_elements * other_dim; + const index_t N = lead_dim * other_dim; + constexpr int other_side = 1 - side; + + VectorizedLoader lloader(param.inputs[side], param.size[side]); + + for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < M; + idx += gridDim.x * blockDim.x) { + index_t original_idx; + DType * current_output_pointer; + index_t output_size; + index_t output_idx; + if (aligned) { + //Simplified case + original_idx = idx * nvec; + const index_t lindex = mxnet_op::unravel_dot(original_idx, param.oshape, + param.stride[side]); + lloader.load(lindex / nvec, param.size[side]); + current_output_pointer = param.outputs[0]; + output_size = N; + output_idx = idx; + } else { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + original_idx = lead_dim_idx * nvec - + lloader.alignment() + row * lead_dim; + const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; + const index_t lindex = mxnet_op::unravel_dot(original_idx_clamped, param.oshape, + param.stride[side]); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); + current_output_pointer = param.outputs[0] + row * lead_dim; + output_size = lead_dim; + output_idx = lead_dim_idx; + } + VectorizedStorer storer(current_output_pointer, output_size); + + if (req == kAddTo) { + storer.load(output_idx, output_size); + } +#pragma unroll + for (int i = 0; i < lloader.nvec(); ++i) { + const index_t rindex = min(max(mxnet_op::unravel_dot(original_idx + i, + param.oshape, + param.stride[other_side]), + static_cast(0)), + param.size[other_side] - 1); + DType rinput = param.inputs[other_side][rindex]; + DType temp; + if (side == 0) { + // Left side is vectorized + temp = OP::Map(lloader.separate()[i], + rinput); + } else { + // Right side is vectorized + temp = OP::Map(rinput, + lloader.separate()[i]); + } + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(output_idx, output_size); + } +} + +template +class VectorizedBinaryBroadcastFwd { + public: + using ParamType = VectorizedBinaryBroadcastParam; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t other_dim) { + int common_shape = 1; + int first_different = -1; + for (int i = ndim - 1; i >= 0; --i) { + if (params.stride[0][i] == params.stride[1][i]) { + common_shape *= params.oshape[i]; + } else { + first_different = i; + break; + } + } + + if (common_shape != 1) { + VectorizedLoader loader(params.inputs[0], lead_dim); + const index_t num_elements_per_row = loader.num_aligned_elements(); + VectorizedBinaryBroadcastKernel + <<>>(params, lead_dim, other_dim, num_elements_per_row); + } else { + if (params.stride[0][first_different] == 0) { + VectorizedLoader loader(params.inputs[1], lead_dim); + const index_t num_elements_per_row = loader.num_aligned_elements(); + VectorizedBinaryBroadcastSingleSideKernel + <<>>(params, lead_dim, other_dim, num_elements_per_row); + } else { + VectorizedLoader loader(params.inputs[0], lead_dim); + const index_t num_elements_per_row = loader.num_aligned_elements(); + VectorizedBinaryBroadcastSingleSideKernel + <<>>(params, lead_dim, other_dim, num_elements_per_row); + } + } + } 
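+
+  // Dispatch note: when the trailing dimensions of the two operands share the
+  // same strides (common_shape != 1 above), both inputs advance contiguously
+  // along the leading dimension, so the two-sided vectorized kernel is used.
+  // Otherwise the operand whose stride is 0 in the first differing dimension
+  // is broadcast there; it is read element by element and only the other
+  // operand is vectorized, via the single-side kernel.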
+}; + template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { + using common::cuda::VectorizedKernelLauncher; if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - int N = out.shape_.Size(); - const int warpSize = 32; - const int unroll = 2; - int nthread = std::min(kMaxThreadsPerBlock, ((N + warpSize - 1)/warpSize)*warpSize ); - int ngrid = std::min(kBaseGridNum, (N + nthread*unroll - 1) / (nthread*unroll)); - Shape lstride = calc_stride(lhs.shape_.get()); - Shape rstride = calc_stride(rhs.shape_.get()); - binary_broadcast_kernel<<>>( - N, req == kAddTo, lhs.dptr(), rhs.dptr(), out.dptr(), lstride, rstride, - out.shape_.get()); + const index_t N = out.shape_.Size(); + + Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); + Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); + + MXNET_ASSIGN_REQ_SWITCH(req, Req, { + using LType = uint2; + using Kernel = VectorizedBinaryBroadcastFwd; + + typename Kernel::ParamType param; + + param.inputs[0] = lhs.dptr(); + param.inputs[1] = rhs.dptr(); + param.outputs[0] = out.dptr(); + param.stride[0] = lstride; + param.stride[1] = rstride; + param.oshape = out.shape_.get(); + param.size[0] = lhs.shape_.Size(); + param.size[1] = rhs.shape_.Size(); + + index_t lead_dim = 1; + for (int i = ndim - 1; i >= 0; --i) { + /* Find the first non-1 dimension + to check the alignment + */ + if (param.oshape[i] != 1) { + lead_dim = param.oshape[i]; + break; + } + } + const index_t other_dim = out.shape_.Size() / lead_dim; + + VectorizedKernelLauncher(lead_dim, other_dim, s, param); + }); } const int nthread_reduce = kMaxThreadsPerBlock; @@ -92,8 +288,8 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const int Mend = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext); for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - Shape coord = unravel(idx, small_shape); - int idx_big0 = ravel(coord, big_shape0); + Shape coord = mxnet_op::unravel(idx, small_shape); + int idx_big0 = mxnet_op::ravel(coord, big_shape0); AType val, residual; Reducer::SetInitValue(val, residual); @@ -102,7 +298,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, int idx_big[unroll]; #pragma unroll for (int u=0;u < unroll;u++) { - idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride); + idx_big[u] = idx_big0 + mxnet_op::unravel_dot(k + u*by, big_shape, big_stride); } DType tmp[unroll]; #pragma unroll @@ -175,10 +371,10 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const int Mend = (int)((uint64_t)M*(uint64_t)(m0 + 1)/(uint64_t)Mnext); for (int idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - Shape coord = unravel(idx, small_shape); - int idx_big0 = ravel(coord, big_shape0); - int idx_lhs0 = ravel(coord, lhs_shape0); - int idx_rhs0 = ravel(coord, rhs_shape0); + Shape coord = mxnet_op::unravel(idx, small_shape); + int idx_big0 = mxnet_op::ravel(coord, big_shape0); + int idx_lhs0 = mxnet_op::ravel(coord, lhs_shape0); + int idx_rhs0 = mxnet_op::ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); @@ -189,9 +385,9 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, int idx_rhs[unroll]; #pragma unroll for (int u=0;u < unroll;u++) { - idx_big[u] = idx_big0 + unravel_dot(k + u*by, big_shape, big_stride); - idx_lhs[u] = idx_lhs0 + unravel_dot(k + u*by, lhs_shape, lhs_stride); - 
idx_rhs[u] = idx_rhs0 + unravel_dot(k + u*by, rhs_shape, rhs_stride); + idx_big[u] = idx_big0 + mxnet_op::unravel_dot(k + u*by, big_shape, big_stride); + idx_lhs[u] = idx_lhs0 + mxnet_op::unravel_dot(k + u*by, lhs_shape, lhs_stride); + idx_rhs[u] = idx_rhs0 + mxnet_op::unravel_dot(k + u*by, rhs_shape, rhs_stride); } DType tmp[unroll]; #pragma unroll @@ -267,8 +463,8 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, const DType* __restrict big, OType *small, const Shape bshape, const Shape sshape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { - Shape coord = unravel(idx, sshape); - int j = ravel(coord, bshape); + Shape coord = mxnet_op::unravel(idx, sshape); + int j = mxnet_op::ravel(coord, bshape); AType val, residual; Reducer::SetInitValue(val, residual); Reducer::Reduce(val, AType(OP::Map(big[j])), residual); @@ -289,10 +485,10 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, const Shape rhs_shape, const Shape small_shape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { - Shape coord = unravel(idx, small_shape); - int idx_big = ravel(coord, big_shape); - int idx_lhs = ravel(coord, lhs_shape); - int idx_rhs = ravel(coord, rhs_shape); + Shape coord = mxnet_op::unravel(idx, small_shape); + int idx_big = mxnet_op::ravel(coord, big_shape); + int idx_lhs = mxnet_op::ravel(coord, lhs_shape); + int idx_rhs = mxnet_op::ravel(coord, rhs_shape); DType val, residual; Reducer::SetInitValue(val, residual); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); @@ -660,16 +856,16 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, } template -size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big) { +size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, nullptr, nullptr); return config.workspace_size; } template -size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big, const mxnet::TShape& lhs, const mxnet::TShape& rhs) { +size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, const ::mxnet::TShape& rhs) { if (req == kNullOp) return 0; ReduceImplConfig config = ConfigureReduceImpl(small, big, &lhs, &rhs); return config.workspace_size; diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 841fbcd28a68..46bb5e79b262 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -31,27 +31,184 @@ #include #include #include "../mshadow_op.h" +#include "../mxnet_op.h" #include "../operator_common.h" +#if MXNET_USE_CUDA +#include "../../common/cuda_vectorization.cuh" +#endif namespace mxnet { namespace op { +namespace mxnet_op { +template +struct binary_broadcast_kernel { + /*! 
\brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, IType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, LType *lhs, RType *rhs, + OType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + template + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, IType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } + + /*! \brief Map function for binary_broadcast_kernel */ + /* used for mixed type binary ops */ + template::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType *lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } + + /*! 
\brief Map function for binary_broadcast_kernel */ + /* used for mixed type binary ops */ + template::value && + !std::is_pointer::value, int>::type = 0> + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, + const Shape &lstride, const Shape &rstride, + const Shape &oshape, IType lhs, DType *rhs, + DType *out) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); + // starts from 1 to avoid extra inc at end of loop + for (index_t i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); + } + } +}; + +template +struct csr_dns_csr_broadcast_kernel { + /*! + * \brief Map function for broadcast between csr and 1D vector + * \param row global thread id/assigned row id + * \param csr_data ptr to data buffer of csr matrix + * \param csr_indices ptr to indices buffer of csr matrix + * \param csr_indptr ptr to indptr buffer of csr matrix + * \param dns ptr to data buffer of the dense vector + * \param out ptr to the data buffer of the result csr matrix + */ + template + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, + const RType *csr_indptr, const DType *dns, DType *out) { + const nnvm::dim_t curr_row_i = csr_indptr[row]; + const nnvm::dim_t next_row_i = csr_indptr[row + 1]; + for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { + KERNEL_ASSIGN(out[iter], req, OP::Map(csr_data[iter], + (col_vec)? dns[row] : dns[csr_indices[iter]])); + } + } + + /*! + * \brief Map function for broadcast between csr and a scalar + * \param i global thread id + * \param csr_data ptr to data buffer of csr matrix + * \param scalar_ptr ptr to data buffer of the scalar tensor, only the 0-th element is used + * \param out ptr to the data buffer of output csr matrix + * \param nnz number of non-zero elements in input csr matrix + */ + template + MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, + DType *out, const nnvm::dim_t nnz) { + const DType scale = scalar_ptr[0]; + if (i < nnz) { + KERNEL_ASSIGN(out[i], req, OP::Map(csr_data[i], scale)); + } + } +}; + +template +struct csr_dns_map_kernel { + template + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, + const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, + const nnvm::dim_t num_cols) { + if (row < num_rows) { + const nnvm::dim_t curr_row_i = csr_indptr[row]; + const nnvm::dim_t next_row_i = csr_indptr[row + 1]; + for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { + const nnvm::dim_t target = row * num_cols + csr_indices[iter]; + KERNEL_ASSIGN(out[target], req, + reverse ? OP::Map(out[target], csr_data[iter]) : + OP::Map(csr_data[iter], out[target])); + } + } + } +}; + +} // namespace mxnet_op + namespace broadcast { using namespace mshadow; const int MAX_DIM = 5; -template -MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { - Shape stride; - index_t cumprod = 1; - #pragma unroll - for (int i = ndim - 1; i >= 0; --i) { - stride[i] = (shape[i] > 1) ? 
cumprod : 0; - cumprod *= shape[i]; - } - return stride; -} - template MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, const Shape& stridej, const Shape& stridek, index_t* j, index_t* k) { @@ -67,28 +224,6 @@ MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, } } -template -MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { - Shape ret; - #pragma unroll - for (index_t i = ndim-1, j = idx; i >=0; --i) { - auto tmp = j / shape[i]; - ret[i] = j - tmp*shape[i]; - j = tmp; - } - return ret; -} - -template -MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { - index_t ret = 0; - #pragma unroll - for (index_t i = 0; i < ndim; ++i) { - ret = ret * shape[i] + (shape[i] > 1) * coord[i]; - } - return ret; -} - template MSHADOW_XINLINE int diff(const Shape& small, const Shape& big, @@ -114,28 +249,6 @@ MSHADOW_XINLINE int diff(const Shape& small, return mdim; } -template -MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, - const Shape& stride) { - index_t ret = 0; - #pragma unroll - for (index_t i = ndim-1, j = idx; i >=0; --i) { - auto tmp = j / shape[i]; - ret += (j - tmp*shape[i])*stride[i]; - j = tmp; - } - return ret; -} - -template -MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { - index_t ret = 0; - #pragma unroll - for (int i = 0; i < ndim; ++i) - ret += coord[i] * stride[i]; - return ret; -} - template MSHADOW_XINLINE void assign(DType* dst, const bool addto, const DType src) { if (addto) { @@ -151,9 +264,9 @@ MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto const DType* __restrict rhs, DType* out, const Shape& lshape, const Shape& rshape, const Shape& oshape) { - const Shape coord = unravel(idx, oshape); - const index_t j = ravel(coord, lshape); - const index_t k = ravel(coord, rshape); + const Shape coord = mxnet_op::unravel(idx, oshape); + const index_t j = mxnet_op::ravel(coord, lshape); + const index_t k = mxnet_op::ravel(coord, rshape); assign(&out[idx], addto, OP::Map(lhs[j], rhs[k])); } @@ -162,13 +275,13 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const DType* __restrict big, OType *small, const Shape& bshape, const Shape& sshape, const Shape& rshape, const Shape& rstride) { - Shape coord = unravel(idx, sshape); - index_t j = ravel(coord, bshape); + Shape coord = mxnet_op::unravel(idx, sshape); + index_t j = mxnet_op::ravel(coord, bshape); AType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { - coord = unravel(k, rshape); - Reducer::Reduce(val, AType(OP::Map(big[j + dot(coord, rstride)])), residual); + coord = mxnet_op::unravel(k, rshape); + Reducer::Reduce(val, AType(OP::Map(big[j + mxnet_op::dot(coord, rstride)])), residual); } Reducer::Finalize(val, residual); assign(&small[idx], addto, OType(val)); @@ -179,23 +292,15 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const #else -template -void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, - const DType *rhs, DType *out, const Shape lshape, - const Shape rshape, const Shape oshape) { - for (size_t idx = 0; idx < N; ++idx) { - binary_broadcast_assign(idx, addto, lhs, rhs, out, lshape, rshape, oshape); - } -} - template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { - if (req == kNullOp) return; - size_t N = out.shape_.Size(); - binary_broadcast_compute(N, req == 
kAddTo, lhs.dptr(), rhs.dptr(), - out.dptr(), lhs.shape_.get(), rhs.shape_.get(), - out.shape_.get()); + mshadow::Shape oshape = out.shape_.get(); + mshadow::Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); + mshadow::Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); + mxnet_op::Kernel, cpu>:: + template LaunchEx(s, out.shape_.Size(), req, lstride, rstride, oshape, + lhs.dptr(), rhs.dptr(), out.dptr()); } template @@ -220,8 +325,8 @@ void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool add const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t idx = 0; idx < static_cast(N); ++idx) { - Shape coord = unravel(idx, sshape); - index_t j = ravel(coord, bshape); + Shape coord = mxnet_op::unravel(idx, sshape); + index_t j = mxnet_op::ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { @@ -278,8 +383,8 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t k = 0; k < static_cast(M); k++) { - Shape coord = unravel(k, rshape); - ws_dptr[k] = dot(coord, rstride); + Shape coord = mxnet_op::unravel(k, rshape); + ws_dptr[k] = mxnet_op::dot(coord, rstride); } seq_reduce_compute_extra_mem( @@ -310,21 +415,21 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const const Shape& lhs_shape, const Shape& rhs_shape, const Shape& rstride, const Shape& lhs_stride, const Shape& rhs_stride) { - Shape coord = unravel(idx, small_shape); - const index_t idx_big0 = ravel(coord, big_shape); - const index_t idx_lhs0 = ravel(coord, lhs_shape0); - const index_t idx_rhs0 = ravel(coord, rhs_shape0); + Shape coord = mxnet_op::unravel(idx, small_shape); + const index_t idx_big0 = mxnet_op::ravel(coord, big_shape); + const index_t idx_lhs0 = mxnet_op::ravel(coord, lhs_shape0); + const index_t idx_rhs0 = mxnet_op::ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { - Shape coord_big = unravel(k, rshape); - index_t idx_big = idx_big0 + dot(coord_big, rstride); + Shape coord_big = mxnet_op::unravel(k, rshape); + index_t idx_big = idx_big0 + mxnet_op::dot(coord_big, rstride); - Shape coord_lhs = unravel(k, lhs_shape); - index_t idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); + Shape coord_lhs = mxnet_op::unravel(k, lhs_shape); + index_t idx_lhs = idx_lhs0 + mxnet_op::dot(coord_lhs, lhs_stride); - Shape coord_rhs = unravel(k, rhs_shape); - index_t idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); + Shape coord_rhs = mxnet_op::unravel(k, rhs_shape); + index_t idx_rhs = idx_rhs0 + mxnet_op::dot(coord_rhs, rhs_stride); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); } diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index 12af331eefb0..ab187f6d54f7 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -1500,7 +1500,7 @@ struct pick { const IType *idx, index_t M, int stride, mshadow::Shape bshape, mshadow::Shape sshape) { - using namespace broadcast; + using namespace mxnet_op; index_t j = static_cast(idx[i]); if (clip) { if (j <= 0) j = 0; @@ -1522,7 +1522,7 @@ struct pick_grad { const IType *idx, index_t M, int stride, mshadow::Shape bshape, 
mshadow::Shape sshape) { - using namespace broadcast; + using namespace mxnet_op; index_t j = static_cast(idx[i]); if (clip) { if (j <= 0) j = 0; diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index ca83bdb01e37..a47ff4cedff6 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -183,174 +183,10 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } else { LOG(FATAL) << "Too many broadcast dimensions with operands " << lshape << " " << rshape; } + return j; } -namespace mxnet_op { -template -struct binary_broadcast_kernel { - /*! \brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType *lhs, IType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! \brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, LType *lhs, RType *rhs, - OType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! \brief Map function for binary_broadcast_kernel */ - template - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType lhs, IType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); - } - } - - /*! 
\brief Map function for binary_broadcast_kernel */ - /* used for mixed type binary ops */ - template::value, int>::type = 0> - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType *lhs, DType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } - } - - /*! \brief Map function for binary_broadcast_kernel */ - /* used for mixed type binary ops */ - template::value && - !std::is_pointer::value, int>::type = 0> - MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, - const Shape &lstride, const Shape &rstride, - const Shape &oshape, IType lhs, DType *rhs, - DType *out) { - Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (index_t i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx])); - } - } -}; - -template -struct csr_dns_csr_broadcast_kernel { - /*! - * \brief Map function for broadcast between csr and 1D vector - * \param row global thread id/assigned row id - * \param csr_data ptr to data buffer of csr matrix - * \param csr_indices ptr to indices buffer of csr matrix - * \param csr_indptr ptr to indptr buffer of csr matrix - * \param dns ptr to data buffer of the dense vector - * \param out ptr to the data buffer of the result csr matrix - */ - template - MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, - const RType *csr_indptr, const DType *dns, DType *out) { - const nnvm::dim_t curr_row_i = csr_indptr[row]; - const nnvm::dim_t next_row_i = csr_indptr[row + 1]; - for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { - KERNEL_ASSIGN(out[iter], req, OP::Map(csr_data[iter], - (col_vec)? dns[row] : dns[csr_indices[iter]])); - } - } - - /*! 
- * \brief Map function for broadcast between csr and a scalar - * \param i global thread id - * \param csr_data ptr to data buffer of csr matrix - * \param scalar_ptr ptr to data buffer of the scalar tensor, only the 0-th element is used - * \param out ptr to the data buffer of output csr matrix - * \param nnz number of non-zero elements in input csr matrix - */ - template - MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, - DType *out, const nnvm::dim_t nnz) { - const DType scale = scalar_ptr[0]; - if (i < nnz) { - KERNEL_ASSIGN(out[i], req, OP::Map(csr_data[i], scale)); - } - } -}; - -template -struct csr_dns_map_kernel { - template - MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, - const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, - const nnvm::dim_t num_cols) { - if (row < num_rows) { - const nnvm::dim_t curr_row_i = csr_indptr[row]; - const nnvm::dim_t next_row_i = csr_indptr[row + 1]; - for (nnvm::dim_t iter = curr_row_i; iter < next_row_i; iter++) { - const nnvm::dim_t target = row * num_cols + csr_indices[iter]; - KERNEL_ASSIGN(out[target], req, - reverse ? OP::Map(out[target], csr_data[iter]) : - OP::Map(csr_data[iter], out[target])); - } - } - } -}; - -} // namespace mxnet_op - template void BinaryBroadcastIntCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -402,12 +238,10 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); - mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); + broadcast::BinaryBroadcastComputeImpl(s, req[0], + inputs[0].reshape(new_lshape), + inputs[1].reshape(new_rshape), + outputs[0].reshape(new_oshape)); }); }); } diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh new file mode 100644 index 000000000000..0bb9fa636f45 --- /dev/null +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2020 by Contributors + * \file elemwise_binary_op.cuh + * \brief GPU helpers for elementwise operators + */ + +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ + +#include +#include "../operator_common.h" +#include "../../common/cuda_vectorization.cuh" + +#include + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +namespace binary { + +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +struct VectorizedBinaryKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; +}; + +template +__global__ void VectorizedBinaryKernelFwd(const VectorizedBinaryKernelParams params, + const index_t N) { + VectorizedLoader loader0(params.inputs[0], N); + VectorizedLoader loader1(params.inputs[1], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = loader0.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader0.load(tid, N); + loader1.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader0.nvec(); ++i) { + DType temp = OP::Map(loader0.separate()[i], + loader1.separate()[i]); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +__global__ void VectorizedBinaryKernelBwdUseNone( + const VectorizedBinaryKernelParams params, + const index_t N) { + VectorizedLoader loader(params.inputs[0], N); + VectorizedStorer lstorer(params.outputs[0], N); + VectorizedStorer rstorer(params.outputs[1], N); + + const index_t M = loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (lreq == kAddTo) { + lstorer.load(tid, N); + } + if (rreq == kAddTo) { + rstorer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader.nvec(); ++i) { + DType inp = loader.separate()[i]; + if (!((std::is_same::value && lreq == kWriteInplace) || + lreq == kNullOp)) { + DType ltemp = LOP::Map(inp); + if (lreq == kAddTo) { + lstorer.separate()[i] += ltemp; + } else { + lstorer.separate()[i] = ltemp; + } + lstorer.store(tid, N); + } + if (!((std::is_same::value && rreq == kWriteInplace) || + rreq == kNullOp)) { + DType rtemp = ROP::Map(inp); + + if (rreq == kAddTo) { + rstorer.separate()[i] += rtemp; + } else { + rstorer.separate()[i] = rtemp; + } + rstorer.store(tid, N); + } + } + } +} + +template +__global__ void VectorizedBinaryKernelBwdUseIn( + const VectorizedBinaryKernelParams params, + const index_t N) { + VectorizedLoader ograd_loader(params.inputs[0], N); + VectorizedLoader linput_loader(params.inputs[1], N); + VectorizedLoader rinput_loader(params.inputs[2], N); + VectorizedStorer lstorer(params.outputs[0], N); + VectorizedStorer rstorer(params.outputs[1], N); + + const index_t M = ograd_loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + linput_loader.load(tid, N); + rinput_loader.load(tid, N); + if (lreq == kAddTo) { + lstorer.load(tid, N); + } + if (rreq == kAddTo) { + rstorer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < ograd_loader.nvec(); ++i) { + DType ograd = ograd_loader.separate()[i]; + DType linput = linput_loader.separate()[i]; + DType rinput 
= rinput_loader.separate()[i]; + if (!(lreq == kNullOp)) { + DType ltemp = ograd * LOP::Map(linput, rinput); + if (lreq == kAddTo) { + lstorer.separate()[i] += ltemp; + } else { + lstorer.separate()[i] = ltemp; + } + lstorer.store(tid, N); + } + if (!(rreq == kNullOp)) { + DType rtemp = ograd * ROP::Map(linput, rinput); + + if (rreq == kAddTo) { + rstorer.separate()[i] += rtemp; + } else { + rstorer.separate()[i] = rtemp; + } + rstorer.store(tid, N); + } + } + } +} + +template +class VectorizedBinaryFwd { + public: + using ParamType = VectorizedBinaryKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedBinaryKernelFwd + <<>>(params, lead_dim); + } +}; + +template +class VectorizedBinaryBwdUseNone { + public: + using ParamType = VectorizedBinaryKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedBinaryKernelBwdUseNone + <<>>(params, lead_dim); + } +}; + +template +class VectorizedBinaryBwdUseIn { + public: + using ParamType = VectorizedBinaryKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedBinaryKernelBwdUseIn + <<>>(params, lead_dim); + } +}; + +} // namespace binary + +template +void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary; + if (req[0] == kNullOp) return; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedBinaryFwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.inputs[1] = inputs[1].dptr(); + params.outputs[0] = outputs[0].dptr(); + + VectorizedKernelLauncher(size, 1, s, params); + }); + }); +} + +template +void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary; + cudaStream_t stream = mshadow::Stream::GetStream(s); + + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + const index_t size = inputs[0].Size(); + if (req[0] != kNullOp || req[1] != kNullOp) { + MXNET_REQ_TYPE_SWITCH(req[0], lreq, { + MXNET_REQ_TYPE_SWITCH(req[1], rreq, { + using LType = uint4; + using Kernel = VectorizedBinaryBwdUseNone; + + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.outputs[1] = outputs[1].dptr(); + + VectorizedKernelLauncher(size, 1, s, params); + }); + }); + } + }); +} + +template +void ElemwiseBinaryOp::BackwardUseIn_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary; + if (req[0] != kNullOp || req[1] != kNullOp) { + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MXNET_REQ_TYPE_SWITCH(req[0], lreq, { + MXNET_REQ_TYPE_SWITCH(req[1], rreq, { + const index_t size = inputs[0].Size(); + // Using 64 bit 
loads to reduce register pressure + using LType = uint2; + using Kernel = VectorizedBinaryBwdUseIn; + + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.inputs[1] = inputs[1].dptr(); + params.inputs[2] = inputs[2].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.outputs[1] = outputs[1].dptr(); + + VectorizedKernelLauncher(size, 1, s, params); + }); + }); + }); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_CUDA +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 7094e1e7367c..158c46122d31 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -106,62 +106,85 @@ class ElemwiseBinaryOp : public OpBase { } private: - template + template static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, + mshadow::Stream* s, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using namespace mxnet_op; - Stream *s = ctx.get_stream(); - const int size = static_cast((outputs[0].Size() + DataType::kLanes - 1) - / DataType::kLanes); - const DType *ograd_dptr = inputs[0].dptr(); - if (std::is_same::value && req[0] == kWriteInplace) { - CHECK_EQ(ograd_dptr, outputs[0].dptr()); - } else if (req[0] != kNullOp) { - DType *lgrad_dptr = outputs[0].dptr(); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using namespace mxnet_op; + const int size = static_cast((outputs[0].Size() + DataType::kLanes - 1) + / DataType::kLanes); + const DType *ograd_dptr = inputs[0].dptr(); + if (std::is_same::value && req[0] == kWriteInplace) { + CHECK_EQ(ograd_dptr, outputs[0].dptr()); + } else if (req[0] != kNullOp) { + DType *lgrad_dptr = outputs[0].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + Kernel, cpu>::Launch(s, size, lgrad_dptr, ograd_dptr); + }); + } + if (std::is_same::value && req[1] == kWriteInplace) { + CHECK_EQ(ograd_dptr, outputs[1].dptr()); + } else if (req[1] != kNullOp) { + DType *rgrad_dptr = outputs[1].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { + Kernel, cpu>::Launch(s, size, rgrad_dptr, ograd_dptr); + }); + } + }); + } +#if MXNET_USE_CUDA + template + static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + DCHECK_EQ(outputs.size(), 2U); + DCHECK_EQ(inputs.size(), 3U); + const DType *ograd_dptr = inputs[0].dptr(); + const DType *lhs_dptr = inputs[1].dptr(); + const DType *rhs_dptr = inputs[2].dptr(); MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - Kernel, xpu>::Launch(s, size, lgrad_dptr, ograd_dptr); + const int size = static_cast( + (outputs[0].Size() + mxnet_op::DataType::kLanes - 1) + / mxnet_op::DataType::kLanes); + DType * lgrad_dptr = outputs[0].dptr(); + mxnet_op::Kernel< + mxnet_op::op_with_req, Req>, cpu>::Launch( + s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr); }); - } - if (std::is_same::value && req[1] == kWriteInplace) { - CHECK_EQ(ograd_dptr, outputs[1].dptr()); - } else if (req[1] != kNullOp) { - DType *rgrad_dptr = outputs[1].dptr(); MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { - Kernel, xpu>::Launch(s, size, rgrad_dptr, ograd_dptr); + const int size = 
static_cast( + (outputs[1].Size() + mxnet_op::DataType::kLanes - 1) + / mxnet_op::DataType::kLanes); + DType * rgrad_dptr = outputs[1].dptr(); + mxnet_op::Kernel< + mxnet_op::op_with_req, Req>, cpu>::Launch( + s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr); }); - } + }); } - template +#if MXNET_USE_CUDA + template static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, + mshadow::Stream* s, const std::vector &inputs, const std::vector &req, - const std::vector &outputs) { - DCHECK_EQ(outputs.size(), 2U); - DCHECK_EQ(inputs.size(), 3U); - mxnet_op::Stream *s = ctx.get_stream(); - const DType *ograd_dptr = inputs[0].dptr(); - const DType *lhs_dptr = inputs[1].dptr(); - const DType *rhs_dptr = inputs[2].dptr(); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - const int size = static_cast( - (outputs[0].Size() + mxnet_op::DataType::kLanes - 1) - / mxnet_op::DataType::kLanes); - DType * lgrad_dptr = outputs[0].dptr(); - mxnet_op::Kernel, Req>, xpu>::Launch( - s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); - MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { - const int size = static_cast( - (outputs[1].Size() + mxnet_op::DataType::kLanes - 1) - / mxnet_op::DataType::kLanes); - DType * rgrad_dptr = outputs[1].dptr(); - mxnet_op::Kernel, Req>, xpu>::Launch( - s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); - } + const std::vector &outputs); +#endif template< typename xpu, @@ -471,15 +494,13 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mxnet_op; - if (req[0] == kNullOp) return; - Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (outputs[0].type_flag_ == mshadow::kBool) { @@ -490,7 +511,7 @@ class ElemwiseBinaryOp : public OpBase { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; if (size != 0) { - Kernel, xpu>::Launch(s, size, + Kernel, cpu>::Launch(s, size, outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr()); } @@ -556,6 +577,26 @@ template }); } +#if MXNET_USE_CUDA + template + static void Compute_(const nnvm::NodeAttrs &attrs, + mshadow::Stream *s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + Compute_(attrs, s, inputs, req, outputs); + } + template static void ComputeWithBool(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -607,30 +648,6 @@ template }); } - template - static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - if (req[0] == kNullOp) return; - Stream *s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) 
- + DataType::kLanes - 1) / DataType::kLanes; - if (size != 0) { - Kernel, xpu>::Launch(s, size, - outputs[0].dptr(), - inputs[0].dptr(), inputs[1].dptr()); - } - }); - }); - } - template static void ComputeEx(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -727,20 +744,8 @@ template const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - BackwardUseNone_(attrs, ctx, inputs, req, outputs); - }); - } - - template - static inline void BackwardUseNoneWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - BackwardUseNone_(attrs, ctx, inputs, req, outputs); - }); + mshadow::Stream *s = ctx.get_stream(); + BackwardUseNone_(attrs, s, inputs, req, outputs); } template @@ -784,21 +789,10 @@ template const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - BackwardUseIn_(attrs, ctx, inputs, req, outputs); - }); + mshadow::Stream *s = ctx.get_stream(); + BackwardUseIn_(attrs, s, inputs, req, outputs); } - template - static inline void BackwardUseInWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - BackwardUseIn_(attrs, ctx, inputs, req, outputs); - }); - } template static inline void BackwardUseInEx(const nnvm::NodeAttrs &attrs, @@ -890,4 +884,9 @@ template } // namespace op } // namespace mxnet + +#ifdef __CUDACC__ +#include "elemwise_binary_op.cuh" +#endif // __CUDACC__ + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 16d7fc1ad72b..b21b08d03217 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -218,52 +218,51 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNoneWithHalf2); NNVM_REGISTER_OP(elemwise_sub) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2< - gpu, op::mshadow_op::minus>) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sub) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNoneWithHalf2); NNVM_REGISTER_OP(elemwise_mul) -.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeDnsLRValueEx); NNVM_REGISTER_OP(_backward_mul) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseInWithHalf2); NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", - ElemwiseBinaryOp::ElemwiseBinaryOp::ComputeWithHalf2); + ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseInWithHalf2); NNVM_REGISTER_OP(_mod) -.set_attr("FCompute", 
ElemwiseBinaryOp::ComputeWithHalf2); +.set_attr("FCompute", ElemwiseBinaryOp::Compute); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseInWithHalf2); + ElemwiseBinaryOp::BackwardUseIn); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh new file mode 100644 index 000000000000..bbe606777dcc --- /dev/null +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file elemwise_binary_scalar_op.cuh + * \brief GPU helpers for binary elementwise operators with scalar + */ + +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ + +#include +#include "../operator_common.h" +#include "../../common/cuda_vectorization.cuh" + +#include + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +namespace binary_scalar { + +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +struct VectorizedKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; + DType scalar; +}; + +template +__global__ void VectorizedBinaryScalarKernelFwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader loader0(params.inputs[0], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = loader0.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader0.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader0.nvec(); ++i) { + DType temp = OP::Map(loader0.separate()[i], + params.scalar); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +__global__ void VectorizedBinaryScalarKernelBwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader ograd_loader(params.inputs[0], N); + VectorizedLoader input_loader(params.inputs[1], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = ograd_loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + input_loader.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < ograd_loader.nvec(); ++i) { + DType ograd = ograd_loader.separate()[i]; + DType temp = ograd * OP::Map(input_loader.separate()[i], + params.scalar); + + if (req == kAddTo) { + 
storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +class VectorizedBinaryScalarFwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedBinaryScalarKernelFwd + <<>>(params, lead_dim); + } +}; + +template +class VectorizedBinaryScalarBwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedBinaryScalarKernelBwd + <<>>(params, lead_dim); + } +}; + +} // namespace binary_scalar + +template +void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary_scalar; + if (req[0] == kNullOp) return; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedBinaryScalarFwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.scalar = (DType)alpha; + + VectorizedKernelLauncher(size, 1, s, params); + }); + }); +} + +template +void BinaryScalarOp::Backward_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace binary_scalar; + if (req[0] == kNullOp) return; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedBinaryScalarBwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.inputs[1] = inputs[1].dptr(); + params.outputs[0] = outputs[0].dptr(); + params.scalar = (DType)alpha; + + VectorizedKernelLauncher(size, 1, s, params); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_CUDA +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index c09e41867f46..568daee2f589 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -266,17 +266,17 @@ class BinaryScalarOp : public UnaryOp { } public: - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute_(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { DCHECK_EQ(inputs.size(), 1); DCHECK_EQ(outputs.size(), 1); using namespace mshadow; using namespace mshadow::expr; - Stream *s = 
ctx.get_stream(); TBlob temp_tblob; const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); bool scalar_is_int = param.is_int; @@ -284,20 +284,40 @@ class BinaryScalarOp : public UnaryOp { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { if ((common::is_int(inputs[0].type_flag_) && !scalar_is_int) || (inputs[0].type_flag_ == kBool)) { - Tensor temp_tensor = - ctx.requested[0].get_space_typed(Shape1(inputs[0].Size()), s); + Tensor temp_tensor = + ctx.requested[0].get_space_typed(Shape1(inputs[0].Size()), s); temp_tblob = TBlob(temp_tensor); - CastCompute(attrs, ctx, {inputs[0]}, {kWriteTo}, {temp_tblob}); + CastCompute(attrs, ctx, {inputs[0]}, {kWriteTo}, {temp_tblob}); } else { temp_tblob = inputs[0]; } MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, xpu>::Launch( + mxnet_op::Kernel, cpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), temp_tblob.dptr(), DType(alpha)); }); }); } +#if MXNET_USE_CUDA + template + static void Compute_(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + mshadow::Stream *s = ctx.get_stream(); + Compute_(attrs, ctx, s, inputs, req, outputs); + } + template static void ComputeInt(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -401,27 +421,47 @@ class BinaryScalarOp : public UnaryOp { } } - template - static void Backward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Backward_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mshadow; using namespace mshadow::expr; - Stream *s = ctx.get_stream(); const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); const double alpha = param.scalar; MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet::op::mxnet_op::Kernel, Req>, xpu>:: + mxnet::op::mxnet_op::backward_grad_tuned, Req>, cpu>:: Launch(s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr(), DType(alpha)); }); }); } + +#if MXNET_USE_CUDA + template + static void Backward_(const nnvm::NodeAttrs &attrs, + mshadow::Stream* s, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); +#endif + + template + static void Backward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + Backward_(attrs, s, inputs, req, outputs); + } }; #define MXNET_OPERATOR_REGISTER_BINARY_SCALAR(name) \ @@ -444,4 +484,9 @@ class BinaryScalarOp : public UnaryOp { } // namespace op } // namespace mxnet + +#ifdef __CUDACC__ +#include "elemwise_binary_scalar_op.cuh" +#endif + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index 3c839205683a..3fd017f09ec7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -57,22 +57,19 @@ NNVM_REGISTER_OP(_rdiv_scalar) 
.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rdiv_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::mod_grad>); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_rmod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::rmod_grad>); +.set_attr("FCompute", BinaryScalarOp::Backward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index 2bd52d7b9d7c..f09e40a2eee7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -44,30 +44,25 @@ NNVM_REGISTER_OP(_power_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::power_grad>); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::rpower_grad>); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_hypot_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_hypot_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::hypot_grad_left>); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Compute< - gpu, mshadow_op::smooth_l1_loss>); +.set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Backward< - gpu, mshadow_op::smooth_l1_gradient>); +.set_attr("FCompute", BinaryScalarOp::Backward); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index f9a248214e85..352c74ea9445 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -24,10 +24,118 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" +#include "../../common/cuda_vectorization.cuh" namespace mxnet { namespace op { +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +namespace { + +constexpr size_t num_inputs_per_kernel = 4; + +template +struct VectorizedElementwiseSumKernelParams { + int num_inputs; + const DType* inputs[NumInputs]; + DType* outputs[1]; +}; + +template +__launch_bounds__(mxnet::common::cuda::vectorized_kernel_thread_num) +__global__ void VectorizedElementwiseSumKernel( + const VectorizedElementwiseSumKernelParams params, + const index_t N) { + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = storer.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + if (req == kAddTo) { + storer.load(tid, N); + } else { +#pragma unroll + for (int i = 0; i < storer.nvec(); ++i) { + storer.separate()[i] = 0; + } + } +#pragma unroll + for (int i = 0; i < num_inputs_per_kernel; ++i) { + if (i < 
params.num_inputs) { + VectorizedLoader loader(params.inputs[i], N); + loader.load(tid, N); +#pragma unroll + for (int i = 0; i < loader.nvec(); ++i) { + storer.separate()[i] += loader.separate()[i]; + } + } + } + + storer.store(tid, N); + } +} + + +template +class VectorizedElementwiseSumFwd { + public: + using ParamType = VectorizedElementwiseSumKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedElementwiseSumKernel + <<>>(params, lead_dim); + } +}; + +void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + mshadow::Stream *s = ctx.get_stream(); + if (req[0] == kNullOp) return; + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint2; + const index_t size = inputs[0].Size(); + for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) { + if (i == 0) { + using Kernel = VectorizedElementwiseSumFwd; + typename Kernel::ParamType params; + params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); + for (int j = 0; j < params.num_inputs; ++j) { + params.inputs[j] = inputs[i + j].dptr(); + } + params.outputs[0] = outputs[0].dptr(); + VectorizedKernelLauncher(size, 1, s, params); + } else { + /* During subsequent launches we need to + accumulate into the previous outputs + */ + using Kernel = VectorizedElementwiseSumFwd; + typename Kernel::ParamType params; + params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); + for (int j = 0; j < params.num_inputs; ++j) { + params.inputs[j] = inputs[i + j].dptr(); + } + params.outputs[0] = outputs[0].dptr(); + VectorizedKernelLauncher(size, 1, s, params); + } + } + }); + }); +} + void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -51,8 +159,10 @@ void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, } } +} // namespace + NNVM_REGISTER_OP(add_n) -.set_attr("FCompute", ElementWiseSumComputeWithHalf2) +.set_attr("FCompute", VectorizedElementwiseSum) .set_attr("FComputeEx", ElementWiseSumComputeExGPU); } // namespace op diff --git a/src/operator/tensor/elemwise_sum.h b/src/operator/tensor/elemwise_sum.h index 259c80ddddac..d40ab4de0f0f 100644 --- a/src/operator/tensor/elemwise_sum.h +++ b/src/operator/tensor/elemwise_sum.h @@ -113,18 +113,6 @@ void ElementWiseSumCompute(const nnvm::NodeAttrs& attrs, }); } -template -void ElementWiseSumComputeWithHalf2(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(outputs.size(), 1U); - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - ElementWiseSumCompute_(attrs, ctx, inputs, req, outputs); - }); -} - } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_SUM_H_ diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh new file mode 100644 index 000000000000..8688a8b8ac66 --- /dev/null +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file elemwise_unary_op.cuh + * \brief GPU helpers for unary elementwise operators + */ + +#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ +#define MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ + +#include +#include "../operator_common.h" +#include "../../common/cuda_vectorization.cuh" + +#include + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +namespace unary { + +using common::cuda::VectorizedKernelLauncher; +using common::cuda::VectorizedLoader; +using common::cuda::VectorizedStorer; + +template +struct VectorizedKernelParams { + const DType* inputs[NumInputs]; + DType* outputs[NumOutputs]; +}; + +template +__global__ void VectorizedUnaryScalarKernelFwd(const VectorizedKernelParams params, + const index_t N) { + VectorizedLoader loader(params.inputs[0], N); + VectorizedStorer storer(params.outputs[0], N); + + const index_t M = loader.num_aligned_elements(); + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (req == kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < loader.nvec(); ++i) { + DType temp = OP::Map(loader.separate()[i]); + + if (req == kAddTo) { + storer.separate()[i] += temp; + } else { + storer.separate()[i] = temp; + } + } + storer.store(tid, N); + } +} + +template +class VectorizedUnaryScalarFwd { + public: + using ParamType = VectorizedKernelParams; + + template + static void Launch(const index_t blocks, const index_t threads, + cudaStream_t stream, + const ParamType params, const index_t lead_dim, + const index_t /* other_dim */) { + VectorizedUnaryScalarKernelFwd + <<>>(params, lead_dim); + } +}; + +} // namespace unary + +template +void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace unary; + if (req[0] == kNullOp) return; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using LType = uint4; + using Kernel = VectorizedUnaryScalarFwd; + + const index_t size = outputs[0].Size(); + typename Kernel::ParamType params; + params.inputs[0] = inputs[0].dptr(); + params.outputs[0] = outputs[0].dptr(); + + VectorizedKernelLauncher(size, 1, s, params); + }); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_CUDA +#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index f2148b559c0c..8067491ded2a 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -236,23 +236,42 @@ class UnaryOp : public OpBase { } } - template - static void Compute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - 
const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); + template + static void Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { if (inputs[0].Size() != 0) { - mxnet_op::Kernel, xpu>::Launch( + mxnet_op::Kernel, cpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); } }); }); } +#if MXNET_USE_CUDA + template + static void Compute_(const nnvm::NodeAttrs& attrs, + mshadow::Stream* s, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +#endif + + template + static void Compute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + Compute_(attrs, s, inputs, req, outputs); + } + template static void ComputeMixedType(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -369,23 +388,6 @@ class UnaryOp : public OpBase { } #endif - template - static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mshadow; - using namespace mxnet_op; - Stream *s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { - Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr(), inputs[0].dptr()); - }); - } - template static void IdentityCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -918,4 +920,8 @@ void NumpyNanToNumOpBackward(const nnvm::NodeAttrs& attrs, } // namespace op } // namespace mxnet +#ifdef __CUDACC__ +#include "elemwise_unary_op.cuh" +#endif + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_H_ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 8fef6f5f7b38..6bbba7411851 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -22,6 +22,7 @@ * \brief GPU Implementation of unary functions. */ #include "./elemwise_binary_op.h" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op_pow.cu b/src/operator/tensor/elemwise_unary_op_pow.cu index 4dbdf349cdb0..287a2e87be73 100644 --- a/src/operator/tensor/elemwise_unary_op_pow.cu +++ b/src/operator/tensor/elemwise_unary_op_pow.cu @@ -22,6 +22,7 @@ * \brief GPU Implementation of power (x^k for fixed k) functions. */ #include "./elemwise_binary_op.h" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op_trig.cu b/src/operator/tensor/elemwise_unary_op_trig.cu index 8e28b9c609fa..f5e9d1ccbd6c 100644 --- a/src/operator/tensor/elemwise_unary_op_trig.cu +++ b/src/operator/tensor/elemwise_unary_op_trig.cu @@ -22,6 +22,7 @@ * \brief GPU Implementation of unary trigonometric function. 
*/ #include "./elemwise_binary_op.h" +#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 5e46a68a7a00..babb233e5b80 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9327,4 +9327,82 @@ def test_elemwise_sum_for_gradient_accumulation(): assert stored_grad['write'] == stored_grad['add'] assert stored_grad['write'] == 2 * nrepeat +@with_seed() +def test_elementwise_ops_on_misaligned_input(): + a = mx.nd.array([1,2,3,4], dtype='float16') + b = mx.nd.array([1,2,3,4], dtype='float16') + + c = a[1:3] + d = b[1:3] + # Note: testing just elemwise_add since all elemwise_ops + # share the implementation + mx.nd.elemwise_add(c, d, out=c) + mx.nd.waitall() + + a = mx.nd.array([1,2,3,4], dtype='float16') + b = mx.nd.array([1,2,3,4], dtype='float16') + + c = a[0:3] + d = b[0:3] + mx.nd.elemwise_add(c, d, out=c) + mx.nd.waitall() + assert a[3].asscalar() == 4.0 + +@with_seed() +def test_broadcast_ops_on_misaligned_input(): + dtypes = ['float16', 'float32', 'float64'] + lead_dims = [2,3,4,6,10] + + for dtype in dtypes: + for lead_dim in lead_dims: + for both_ways in [False, True]: + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], 1, lead_dim] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, L] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = np.product(shape) + small_size = np.product(small_shape) + big_size = np.product(big_shape) + a = mx.nd.arange(5000) + b = mx.nd.arange(5000) + e = mx.nd.arange(5000) + c = a[1:big_size + 1].reshape(big_shape) + d = b[1:small_size + 1].reshape(small_shape) + f = e[1:size + 1].reshape(shape) + mx.nd.broadcast_add(c, d, out=f) + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) + +@with_seed() +def test_broadcast_ops_on_misaligned_input_oneside(): + dtypes = ['float16', 'float32', 'float64'] + lead_dims = [2,3,4,6,10] + + for dtype in dtypes: + for lead_dim in lead_dims: + for both_ways in [False, True]: + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], shape[1], 1] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, 1] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = np.product(shape) + small_size = np.product(small_shape) + big_size = np.product(big_shape) + a = mx.nd.arange(5000) + b = mx.nd.arange(5000) + e = mx.nd.arange(5000) + c = a[1:big_size + 1].reshape(big_shape) + d = b[1:small_size + 1].reshape(small_shape) + f = e[1:size + 1].reshape(shape) + mx.nd.broadcast_add(c, d, out=f) + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) From 6b235fdbdd158afdcde4eb84987ebf24dc97026d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 18 May 2020 13:10:11 -0700 Subject: [PATCH 02/64] Making RTC required --- CMakeLists.txt | 12 ++++-------- Makefile | 6 +----- include/mxnet/libinfo.h | 5 ----- include/mxnet/rtc.h | 4 ++-- src/c_api/c_api.cc | 20 ++++++++++---------- src/common/rtc.cc | 4 ++-- src/imperative/cached_op.h | 4 ++-- src/imperative/pointwise_fusion_pass.cc | 6 +++--- src/libinfo.cc | 1 - src/operator/fusion/fused_op-inl.h | 4 ++-- src/operator/fusion/fused_op.cc | 4 ++-- src/operator/fusion/fused_op.cu | 5 ----- src/operator/fusion/fused_op.h | 4 ++-- 13 files changed, 30 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 688dd42c54fe..6e227009eba5 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,7 +80,6 @@ option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) option(USE_GPROF "Compile with gprof (profiling) flag" OFF) option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path option(USE_TVM_OP "Enable use of TVM operator build system." OFF) -option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) option(INSTALL_EXAMPLES "Install the example source files." OFF) option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) @@ -592,13 +591,10 @@ if(USE_CUDA) OPTIONAL_COMPONENTS nvToolsExt nvrtc) list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand) - if(ENABLE_CUDA_RTC) - if(CUDA_nvrtc_LIBRARY) - list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) - else() - message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." ) - endif() + if(CUDA_nvrtc_LIBRARY) + list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda) + else() + message(FATAL_ERROR "Failed to find NVRTC. CMake will exit." ) endif() list(APPEND SOURCE ${CUDA}) add_definitions(-DMXNET_USE_CUDA=1) diff --git a/Makefile b/Makefile index 0bf856f677bb..8d3e6f5bb50d 100644 --- a/Makefile +++ b/Makefile @@ -518,11 +518,7 @@ ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(PLUGIN_OBJ) $(LIB_DEP) ifeq ($(USE_CUDA), 1) CFLAGS += -I$(ROOTDIR)/3rdparty/nvidia_cub ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ) $(PLUGIN_CUOBJ) - LDFLAGS += -lcufft - ifeq ($(ENABLE_CUDA_RTC), 1) - LDFLAGS += -lcuda -lnvrtc - CFLAGS += -DMXNET_ENABLE_CUDA_RTC=1 - endif + LDFLAGS += -lcufft -lcuda -lnvrtc # Make sure to add stubs as fallback in order to be able to build # without full CUDA install (especially if run without nvidia-docker) LDFLAGS += -L/usr/local/cuda/lib64/stubs diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h index ade1c731afcf..0612c4d3198c 100644 --- a/include/mxnet/libinfo.h +++ b/include/mxnet/libinfo.h @@ -70,10 +70,6 @@ #define MXNET_USE_CUSOLVER MSHADOW_USE_CUSOLVER #endif -#ifndef MXNET_ENABLE_CUDA_RTC -#define MXNET_ENABLE_CUDA_RTC 0 -#endif - /*! 
\brief Error message for using gpu when MXNET_USE_CUDA==0 */ #define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled" @@ -146,7 +142,6 @@ enum : unsigned { CUDA = 0, CUDNN, NCCL, - CUDA_RTC, TENSORRT, // CPU Features / optimizations diff --git a/include/mxnet/rtc.h b/include/mxnet/rtc.h index 76c3064db71a..747c0b5c94ab 100644 --- a/include/mxnet/rtc.h +++ b/include/mxnet/rtc.h @@ -20,7 +20,7 @@ #ifndef MXNET_RTC_H_ #define MXNET_RTC_H_ #include "./base.h" -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA #include #include @@ -132,5 +132,5 @@ class CudaModule { } // namespace rtc } // namespace mxnet -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#endif // MXNET_USE_CUDA #endif // MXNET_RTC_H_ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ea39d9ac6e5b..49b82b63d4a4 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -3029,24 +3029,24 @@ int MXRtcCudaModuleCreate(const char* source, int num_options, const char** options, int num_exports, const char** exports, CudaModuleHandle *out) { API_BEGIN(); -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA std::vector str_opts; for (int i = 0; i < num_options; ++i) str_opts.emplace_back(options[i]); std::vector str_exports; for (int i = 0; i < num_exports; ++i) str_exports.emplace_back(exports[i]); *out = new rtc::CudaModule(source, str_opts, str_exports); #else - LOG(FATAL) << "Compile with USE_CUDA=1 and ENABLE_CUDA_RTC=1 to have CUDA runtime compilation."; + LOG(FATAL) << "Compile with USE_CUDA=1 to have CUDA runtime compilation."; #endif API_END(); } int MXRtcCudaModuleFree(CudaModuleHandle handle) { API_BEGIN(); -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA delete reinterpret_cast(handle); #else - LOG(FATAL) << "Compile with USE_CUDA=1 and ENABLE_CUDA_RTC=1 to have CUDA runtime compilation."; + LOG(FATAL) << "Compile with USE_CUDA=1 to have CUDA runtime compilation."; #endif API_END(); } @@ -3055,7 +3055,7 @@ int MXRtcCudaKernelCreate(CudaModuleHandle handle, const char* name, int num_arg int* is_ndarray, int* is_const, int* arg_types, CudaKernelHandle *out) { API_BEGIN(); -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA auto module = reinterpret_cast(handle); std::vector signature; for (int i = 0; i < num_args; ++i) { @@ -3066,17 +3066,17 @@ int MXRtcCudaKernelCreate(CudaModuleHandle handle, const char* name, int num_arg auto kernel = module->GetKernel(name, signature); *out = new std::shared_ptr(kernel); #else - LOG(FATAL) << "Compile with USE_CUDA=1 and ENABLE_CUDA_RTC=1 to have CUDA runtime compilation."; + LOG(FATAL) << "Compile with USE_CUDA=1 to have CUDA runtime compilation."; #endif API_END(); } int MXRtcCudaKernelFree(CudaKernelHandle handle) { API_BEGIN(); -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA delete reinterpret_cast*>(handle); #else - LOG(FATAL) << "Compile with USE_CUDA=1 and ENABLE_CUDA_RTC=1 to have CUDA runtime compilation."; + LOG(FATAL) << "Compile with USE_CUDA=1 to have CUDA runtime compilation."; #endif API_END(); } @@ -3087,7 +3087,7 @@ int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** args, uint32_t block_dim_y, uint32_t block_dim_z, uint32_t shared_mem) { API_BEGIN(); -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA auto kernel = reinterpret_cast*>(handle); const auto& signature = (*kernel)->signature(); std::vector any_args; @@ -3103,7 +3103,7 @@ int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** args, (*kernel)->Launch(Context::GPU(dev_id), any_args, 
grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, block_dim_z, shared_mem); #else - LOG(FATAL) << "Compile with USE_CUDA=1 and ENABLE_CUDA_RTC=1 to have CUDA runtime compilation."; + LOG(FATAL) << "Compile with USE_CUDA=1 to have CUDA runtime compilation."; #endif API_END(); } diff --git a/src/common/rtc.cc b/src/common/rtc.cc index df79ff69ebb7..74c78f281fd0 100644 --- a/src/common/rtc.cc +++ b/src/common/rtc.cc @@ -23,7 +23,7 @@ #include "../common/cuda_utils.h" #include "../operator/operator_common.h" -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA namespace mxnet { namespace rtc { @@ -186,4 +186,4 @@ void CudaModule::Kernel::Launch( } // namespace rtc } // namespace mxnet -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#endif // MXNET_USE_CUDA diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 0c0c7db98174..eeab47b7e178 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -330,7 +330,7 @@ void OptimizeGraph(nnvm::Graph* full_graph, nnvm::Graph* fwd_graph, nnvm::Graph* size_t num_forward_outputs, const bool inlining) { input_map->resize(full_graph->indexed_graph().input_nodes().size()); std::iota(input_map->begin(), input_map->end(), 0); -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) +#if MXNET_USE_CUDA && !defined(_WIN32) if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", true)) { @@ -375,7 +375,7 @@ void OptimizeGraph(nnvm::Graph* full_graph, nnvm::Graph* fwd_graph, nnvm::Graph* dmlc::GetEnv("MXNET_USE_FUSION", false)) { exec::WarnFusionNotSupported(); } -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32) +#endif // MXNET_USE_CUDA && !defined(_WIN32) *fwd_graph = nnvm::Graph(); fwd_graph->outputs = std::vector(full_graph->outputs.begin(), diff --git a/src/imperative/pointwise_fusion_pass.cc b/src/imperative/pointwise_fusion_pass.cc index 3203f67e8b68..656a420eb654 100644 --- a/src/imperative/pointwise_fusion_pass.cc +++ b/src/imperative/pointwise_fusion_pass.cc @@ -48,13 +48,13 @@ void WarnFusionNotSupported() { << "Unset env var MXNET_USE_FUSION=1 to quiet this message."; #else LOG(WARNING) << "Omitting dynamic fused op creation- needs MXNet lib built with " - << "USE_CUDA=1 and ENABLE_CUDA_RTC=1. Unset env var MXNET_USE_FUSION=1 " + << "USE_CUDA=1. 
Unset env var MXNET_USE_FUSION=1 " << "to quiet this message."; #endif // defined(_WIN32) } } -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA namespace { bool IsFusionCompatible(nnvm::Node* n) { @@ -334,7 +334,7 @@ Graph FusePointwiseBackward(Graph &&g) { ret.outputs = g.outputs; return ret; } -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#endif // MXNET_USE_CUDA } // namespace exec } // namespace mxnet diff --git a/src/libinfo.cc b/src/libinfo.cc index 211444e857d2..dd535093f0a0 100644 --- a/src/libinfo.cc +++ b/src/libinfo.cc @@ -39,7 +39,6 @@ class FeatureSet { feature_bits.set(CUDA, MXNET_USE_CUDA); feature_bits.set(CUDNN, MXNET_USE_CUDNN); feature_bits.set(NCCL, MXNET_USE_NCCL); - feature_bits.set(CUDA_RTC, MXNET_ENABLE_CUDA_RTC); feature_bits.set(TENSORRT, MXNET_USE_TENSORRT); // Check flags for example with gcc -msse3 -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX" diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 0b10f821d8e1..58d0410a508e 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -24,7 +24,7 @@ #include #include -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA namespace mxnet { @@ -1028,6 +1028,6 @@ const char kernel_end[] = R"code(} } // namespace mxnet -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#endif // MXNET_USE_CUDA #endif // MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_ diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc index 596f4e7146e0..fafc75d9aa93 100644 --- a/src/operator/fusion/fused_op.cc +++ b/src/operator/fusion/fused_op.cc @@ -23,7 +23,7 @@ #include "../operator_common.h" #include "../../imperative/exec_pass.h" -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA namespace mxnet { @@ -302,4 +302,4 @@ NNVM_REGISTER_OP(_FusedOpOutHelper) } // namespace mxnet -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#endif // MXNET_USE_CUDA diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index fe667946a0c4..39b576b47a65 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -17,9 +17,6 @@ * under the License. */ -// Additional use of MXNET_USE_CUDA is not needed to guard a '.cu' file. 
-#if MXNET_ENABLE_CUDA_RTC - #include #include #include @@ -840,5 +837,3 @@ NNVM_REGISTER_OP(_FusedOp) .set_attr("FCompute", FusedOpForwardGPU); } // namespace mxnet - -#endif // MXNET_ENABLE_CUDA_RTC diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index 3a1db4e2a369..f4cb0600c80f 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -28,7 +28,7 @@ #include #include -#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#if MXNET_USE_CUDA namespace mxnet { @@ -201,6 +201,6 @@ using FusedOpHelperParamPtr = std::shared_ptr; } // namespace mxnet -#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC +#endif // MXNET_USE_CUDA #endif // MXNET_OPERATOR_FUSION_FUSED_OP_H_ From 38f30f06bbb254c52925401dfe03fd1fb4c1ca43 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 19 May 2020 13:29:35 -0700 Subject: [PATCH 03/64] Move cuda utils to src/common/cuda and refactor RTC part --- src/common/cuda/rtc.cc | 177 ++++ src/common/cuda/rtc.h | 65 ++ src/common/cuda/rtc/backward_functions-inl.h | 224 +++++ src/common/cuda/rtc/forward_functions-inl.h | 599 +++++++++++++ src/common/cuda/rtc/half-inl.h | 57 ++ src/common/cuda/rtc/type-inl.h | 47 ++ src/common/{cuda_utils.cc => cuda/utils.cc} | 2 +- src/common/{cuda_utils.h => cuda/utils.h} | 0 .../vectorization.cuh} | 2 +- src/common/rtc.cc | 2 +- src/engine/stream_manager.h | 2 +- src/engine/threaded_engine.cc | 2 +- src/engine/threaded_engine_pooled.cc | 2 +- src/kvstore/kvstore_nccl.h | 2 +- src/ndarray/ndarray_function.cu | 2 +- src/operator/bilinear_sampler.cu | 2 +- .../contrib/deformable_psroi_pooling.cu | 2 +- src/operator/contrib/nn/deformable_im2col.cuh | 2 +- src/operator/contrib/psroi_pooling.cu | 2 +- src/operator/contrib/transformer.cu | 2 +- src/operator/fusion/fused_op-inl.h | 796 ------------------ src/operator/fusion/fused_op.cu | 121 +-- src/operator/linalg_impl.h | 2 +- src/operator/mxnet_op.h | 2 +- src/operator/nn/batch_norm.cu | 1 - src/operator/nn/cudnn/cudnn_activation-inl.h | 2 +- src/operator/nn/cudnn/cudnn_algoreg-inl.h | 2 +- src/operator/nn/cudnn/cudnn_convolution-inl.h | 2 +- .../nn/cudnn/cudnn_deconvolution-inl.h | 2 +- src/operator/nn/depthwise_convolution-inl.h | 2 +- src/operator/nn/depthwise_convolution_tf.cuh | 2 +- src/operator/nn/pool.cuh | 2 +- src/operator/nn/softmax-inl.h | 2 +- src/operator/operator_common.h | 2 +- src/operator/pad.cu | 2 +- src/operator/tensor/broadcast_reduce-inl.h | 2 +- src/operator/tensor/elemwise_binary_op.cuh | 2 +- .../tensor/elemwise_binary_scalar_op.cuh | 2 +- src/operator/tensor/elemwise_sum.cu | 2 +- src/operator/tensor/elemwise_unary_op.cuh | 2 +- .../tensor/pseudo2DTranspose_op-inl.cuh | 2 +- src/profiler/profiler.cc | 2 +- src/resource.cc | 2 +- src/storage/gpu_device_storage.h | 2 +- src/storage/pinned_memory_storage.h | 2 +- src/storage/pooled_storage_manager.h | 2 +- 46 files changed, 1212 insertions(+), 947 deletions(-) create mode 100644 src/common/cuda/rtc.cc create mode 100644 src/common/cuda/rtc.h create mode 100644 src/common/cuda/rtc/backward_functions-inl.h create mode 100644 src/common/cuda/rtc/forward_functions-inl.h create mode 100644 src/common/cuda/rtc/half-inl.h create mode 100644 src/common/cuda/rtc/type-inl.h rename src/common/{cuda_utils.cc => cuda/utils.cc} (99%) rename src/common/{cuda_utils.h => cuda/utils.h} (100%) rename src/common/{cuda_vectorization.cuh => cuda/vectorization.cuh} (99%) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc new file mode 100644 index 000000000000..6d3bb62cd3b8 --- 
/dev/null +++ b/src/common/cuda/rtc.cc @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "mxnet/base.h" + +#if MXNET_USE_CUDA + +#include + +#include +#include + +#include "rtc.h" +#include "rtc/half-inl.h" +#include "rtc/type-inl.h" +#include "rtc/forward_functions-inl.h" +#include "rtc/backward_functions-inl.h" +#include "utils.h" + + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +std::mutex lock; + +namespace { + +// Obtain compilation log from the program. +std::string GetCompileLog(nvrtcProgram program) { + size_t log_size_including_null; + NVRTC_CALL(nvrtcGetProgramLogSize(program, &log_size_including_null)); + // For most std::string implementations, this is probably 1 char bigger than needed. OK though. + std::string log(log_size_including_null, '\0'); + NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); + // Make sure the string reflects the true size (so minus the null terminator). + log.resize(log_size_including_null - 1); + return log; +} + +// Obtain compilation result (ptx assembly) from the program. +std::string GetPtx(nvrtcProgram program) { + size_t ptx_size_including_null; + NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size_including_null)); + // For most std::string implementations, this is probably 1 char bigger than needed. OK though. + std::string ptx(ptx_size_including_null, '\0'); + NVRTC_CALL(nvrtcGetPTX(program, &ptx[0])); + // Make sure the string reflects the true size (so minus the null terminator). + ptx.resize(ptx_size_including_null - 1); + return ptx; +} + +} // namespace + +CUfunction get_function(const std::string &code, + const std::string &kernel_name, + int dev_id) { + constexpr int CACHESIZE_WARN_THRESHOLD = 10000; + std::lock_guard l(lock); + // Local class for value type of compile cache + struct KernelInfo { + std::string mangled_name; + std::string ptx; + std::vector functions; + }; + // Maps from the cuda source code (minus header) to the ptx and jit-compiled CUfunctions. + using KernelCache = std::map; + // Per-gpu-architecture compiled kernel cache with jit-compiled function for each device context + static std::map compiled_kernels; + int sm_arch = SMArch(dev_id); + KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; // make null map as needed + KernelInfo& kinfo = compiled_kernels_this_arch[code]; // make KernelInfo as needed + if (kinfo.ptx.size() == 0) { + // It's the first time we've seen this kernel, so we need to generate the ptx and mangled_name. 
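+    // Assemble the full translation unit handed to NVRTC: the shared RTC
+    // support code (fp16 intrinsics, type aliases, and the op:: forward and
+    // backward device functions) is prepended to the kernel-specific source.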
+ static std::string common_header = + std::string(fp16_support_string) + "\n" + + type_support_string + "\n" + + function_definitions + "\n" + + backward_function_definitions + "\n"; + std::string code_with_header = common_header + code; + // If verbose mode, output kernel source, though not including the common header + if (dmlc::GetEnv("MXNET_RTC_VERBOSE", false)) { + LOG(INFO) << "\n" << std::string(80, '-') << "\n" << code; + } + if (compiled_kernels_this_arch.size() == CACHESIZE_WARN_THRESHOLD + 1 && + dmlc::GetEnv("MXNET_RTC_SIZE_WARNING", true)) { + LOG(WARNING) << "The number of different compiled kernels exceeds " << CACHESIZE_WARN_THRESHOLD + << ". Set MXNET_RTC_SIZE_WARNING=0 to quiet this warning."; + } + nvrtcProgram program; + NVRTC_CALL(nvrtcCreateProgram(&program, // prog + &code_with_header[0], // buffer + (kernel_name + "_kernel.cu").c_str(), // name + 0, // num headers + nullptr, // headers + nullptr)); // include names + + std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); + const char *opts[] = {gpu_arch_arg.c_str(), + "--std=c++11"}; + const std::string kernel_name_demangled = "FusedKernel_" + kernel_name; + NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); + + nvrtcResult compileResult = nvrtcCompileProgram(program, // prog + 2, // num options + opts); // options + CHECK_EQ(compileResult, NVRTC_SUCCESS) + << "NVRTC Compilation failed. Please set environment variable MXNET_USE_FUSION to 0.\n" + << GetCompileLog(program); + + kinfo.ptx = GetPtx(program); + const char *mangled_name; + NVRTC_CALL(nvrtcGetLoweredName(program, + kernel_name_demangled.c_str(), + &mangled_name)); + kinfo.mangled_name = mangled_name; + // Destroy the program. + NVRTC_CALL(nvrtcDestroyProgram(&program)); + } + // Ensure function array is deep enough to index by dev_id + while (kinfo.functions.size() <= static_cast(dev_id)) + kinfo.functions.push_back(static_cast(nullptr)); + // Jit-compile ptx for the device as needed + if (kinfo.functions[dev_id] == static_cast(nullptr)) { + // Make sure driver context is set to the proper device + CUdevice cu_device; + CUcontext context; + CUDA_DRIVER_CALL(cuDeviceGet(&cu_device, dev_id)); + CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); + // Jit-compile ptx for the driver's current context + CUmodule module; + CUDA_DRIVER_CALL(cuModuleLoadData(&module, kinfo.ptx.c_str())); + CUDA_DRIVER_CALL(cuModuleGetFunction(&kinfo.functions[dev_id], + module, + kinfo.mangled_name.c_str())); + } + return kinfo.functions[dev_id]; +} + +void launch(CUfunction function, + const dim3 grid_dim, + const dim3 block_dim, + unsigned int shared_mem_bytes, + mshadow::Stream *stream, + std::vector *args) { + CUDA_DRIVER_CALL( + cuLaunchKernel(function, // function to launch + grid_dim.x, grid_dim.y, grid_dim.z, // grid dim + block_dim.x, block_dim.y, block_dim.z, // block dim + shared_mem_bytes, // shared memory + mshadow::Stream::GetStream(stream), // stream + &((*args)[0]), nullptr)); // arguments +} + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h new file mode 100644 index 000000000000..811936915d31 --- /dev/null +++ b/src/common/cuda/rtc.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_rtc.h + * \brief Common CUDA utilities for + * runtime compilation. + */ + +#ifndef MXNET_COMMON_CUDA_RTC_H_ +#define MXNET_COMMON_CUDA_RTC_H_ + +#include "mxnet/base.h" + +#if MXNET_USE_CUDA + +#include +#include + +#include +#include + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +extern std::mutex lock; + +CUfunction get_function(const std::string &code, + const std::string &kernel_name, + int dev_id); + +void launch(CUfunction function, + const dim3 grid_dim, + const dim3 block_dim, + unsigned int shared_mem_bytes, + mshadow::Stream *stream, + std::vector *args); + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_H_ diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h new file mode 100644 index 000000000000..b5ae8a361a3b --- /dev/null +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_COMMON_CUDA_RTC_BACKWARD_FUNCTIONS_INL_H_ +#define MXNET_COMMON_CUDA_RTC_BACKWARD_FUNCTIONS_INL_H_ + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const char backward_function_definitions[] = R"code( + +namespace op { + +template +__device__ inline DTypeGrad backward_relu(const DType val, const DTypeGrad grad) { + return val > 0 ? 
grad : 0; +} + +template +__device__ inline DTypeGrad backward_sigmoid(const DType out, const DTypeGrad grad) { + return grad * out * (1 - out); +} + +template +__device__ inline DTypeGrad backward_softrelu(const DType val, const DTypeGrad grad) { + return grad * sigmoid(val); +} + +template +__device__ inline DTypeGrad backward_softsign(const DType val, const DTypeGrad grad) { + const DType ap1 = 1 + fabsf(val); + return grad / (ap1 * ap1); +} + +template +__device__ inline DTypeGrad backward_exp(const DType val, const DTypeGrad grad) { + return grad * expf(val); +} + +template +__device__ inline DTypeGrad backward_expm1(const DType val, const DTypeGrad grad) { + return grad * expf(val); +} + +template +__device__ inline DTypeGrad backward_log(const DType val, const DTypeGrad grad) { + return grad / val; +} + +template +__device__ inline DTypeGrad backward_log10(const DType val, const DTypeGrad grad) { + return grad / (val * logf(10)); +} + +template +__device__ inline DTypeGrad backward_log2(const DType val, const DTypeGrad grad) { + return grad / (val * logf(2)); +} + +template +__device__ inline DTypeGrad backward_log1p(const DType val, const DTypeGrad grad) { + return grad / (1 + val); +} + +template +__device__ inline DTypeGrad backward_sin(const DType val, const DTypeGrad grad) { + return grad * cosf(val); +} + +template +__device__ inline DTypeGrad backward_cos(const DType val, const DTypeGrad grad) { + return -grad * sinf(val); +} + +// Uses output from tan +template +__device__ inline DTypeGrad backward_tan(const DType out, const DTypeGrad grad) { + return grad * (out * out + 1); +} + +template +__device__ inline DTypeGrad backward_arcsin(const DType val, const DTypeGrad grad) { + return grad / sqrtf(1 - val*val); +} + +template +__device__ inline DTypeGrad backward_arccos(const DType val, const DTypeGrad grad) { + return -grad / sqrtf(1 - val*val); +} + +template +__device__ inline DTypeGrad backward_arctan(const DType val, const DTypeGrad grad) { + return grad / (1 + val*val); +} + +template +__device__ inline DTypeGrad backward_sinh(const DType val, const DTypeGrad grad) { + return grad * coshf(val); +} + +template +__device__ inline DTypeGrad backward_cosh(const DType val, const DTypeGrad grad) { + return grad * sinhf(val); +} + +// Uses tanh output +template +__device__ inline DTypeGrad backward_tanh(const DType out, const DTypeGrad grad) { + return grad * (1 - out * out); +} + +template +__device__ inline DTypeGrad backward_arcsinh(const DType val, const DTypeGrad grad) { + return grad / sqrtf(val * val + 1); +} + +template +__device__ inline DTypeGrad backward_arccosh(const DType val, const DTypeGrad grad) { + return grad / sqrtf(val * val - 1); +} + +template +__device__ inline DTypeGrad backward_arctanh(const DType val, const DTypeGrad grad) { + return grad / (1 - val * val); +} + +template +__device__ inline DTypeGrad backward_sqrt(const DType out, const DTypeGrad grad) { + return 0.5 * grad / out; +} + +template +__device__ inline DTypeGrad backward_rsqrt(const DType val, const DTypeGrad grad) { + const DType inv = 1 / val; + return -0.5 * grad * sqrtf(inv) * inv; +} + +template +__device__ inline DTypeGrad backward_cbrt(const DType out, const DTypeGrad grad) { + return grad / (3.0f * out * out); +} + +template +__device__ inline DTypeGrad backward_rcbrt(const DType val, const DTypeGrad grad) { + const DType inv = 1 / val; + return -1.f/3.f * grad * cbrtf(inv) * inv; +} + +template +__device__ inline DTypeGrad backward_square(const DType val, const DTypeGrad grad) { + 
return 2 * val * grad; +} + +template +__device__ inline DTypeGrad backward_clip(const DType val, const DTypeGrad grad, + const float a_min, const float a_max) { + if (val > a_max || val < a_min) { + return 0; + } else { + return grad; + } +} + +template +__device__ inline DTypeGrad backward_reciprocal(const DType val, const DTypeGrad grad) { + return -grad / (val * val); +} + +template +__device__ inline DTypeGrad backward_erf(const DType val, const DTypeGrad grad) { + return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; +} + +template +__device__ inline DTypeGrad backward_erfinv(const DType val, const DTypeGrad grad) { + return 0.5f * sqrt(pi) * exp(val * val) * grad; +} + +template +__device__ inline DTypeGrad backward_smooth_l1(const DType val, const DType2 scalar, + const DTypeGrad grad) { + auto bsq = scalar * scalar; + auto ibsq = 1.0f / bsq; + if (val > ibsq) { + return grad; + } else if (val < -ibsq) { + return -grad; + } else { + return bsq * val * grad; + } +} + +} // namespace op + +)code"; + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_BACKWARD_FUNCTIONS_INL_H_ diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h new file mode 100644 index 000000000000..548d2392a393 --- /dev/null +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -0,0 +1,599 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef MXNET_COMMON_CUDA_RTC_FORWARD_FUNCTIONS_INL_H_ +#define MXNET_COMMON_CUDA_RTC_FORWARD_FUNCTIONS_INL_H_ + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const char function_definitions[] = R"code( + +#define INT_MAX (2147483647) + +namespace op { + +template +struct LoadType { + using Type = DType; +}; + +template <> +struct LoadType { + using Type = float; +}; + +template +__device__ inline typename LoadType::Type load(const DType input) { + return input; +} + +template <> +__device__ inline float load(const half input) { + return __half2float(input); +} + +template +__device__ inline DType1 store(const DType2 input, DType1* ref) { + return input; +} + +template +__device__ inline half store(const DType input, half* ref) { + return __float2half(input); +} + +template +struct VectorConfig { + static_assert(size >= 4, "VectorConfig needs to have size of at least 4B"); + using IndexType = float; +}; + +template <> +struct VectorConfig<8> { + using IndexType = double; +}; + +template <> +struct VectorConfig<16> { + using IndexType = double2; +}; + +template <> +struct VectorConfig<32> { + using IndexType = double4; +}; + +template +__device__ inline DType add_elem(const DType& x, const DType& y) { + return x + y; +} + +template <> +__device__ inline half add_elem(const half& x, const half& y) { + return __float2half(__half2float(x) + __half2float(y)); +} + +template +union VectorType { + typename VectorConfig::IndexType y; + DType x[nvec]; + __device__ VectorType () {}; + __device__ VectorType (const VectorType& y2) { + y = y2.y; + } + __device__ VectorType (const decltype(y) &y2) { + y = y2; + } + __device__ inline VectorType& operator+=(const VectorType& rhs) { + #pragma unroll + for (int i = 0; i < nvec; ++i) { + x[i] = add_elem(x[i], rhs.x[i]); + } + return *this; + } +}; + +template +struct Shape { + int x[ndim]; + size_t size; + __device__ inline const int& operator [](const int i) const { + return x[i]; + } + __device__ inline int& operator [](const int i) { + return x[i]; + } + __device__ inline void set(const int def) { + #pragma unroll + for (int i = 0; i < ndim; i++) { + x[i] = def; + } + } +}; + +template <> +struct Shape<0> { + size_t size; +}; + +template +__device__ inline VectorType load_index(const DType * input, int i, + const Shape &shape) { + if (i < shape.size) { + const auto* vector_input = reinterpret_cast< + const typename VectorConfig::IndexType *>( + input + i); + VectorType ret = {*vector_input}; + return ret; + } else { + VectorType ret({0}); + return ret; + } +} + +template +__device__ inline VectorType global_load_index(const DType * input, int i, + const Shape &shape) { + if (i < shape.size) { + const auto* vector_input = reinterpret_cast< + const typename VectorConfig::IndexType *>( + input + i); + VectorType ret = {__ldg(vector_input)}; + return ret; + } else { + VectorType ret({0}); + return ret; + } +} + +template +__device__ inline VectorType load_slice(const DType * input, const Shape& shape, + Shape begin, Shape end, + int offset) { + int idx[nvec]; + + Shape ref_strides; + Shape strides; + ref_strides[ndim-1] = 1; + strides[ndim-1] = 1; + #pragma unroll + for (int dim = ndim-1; dim >=0; dim--) { + if (begin[dim] < 0) begin[dim] = shape[dim] + begin[dim]; + if (end[dim] < 0) end[dim] = shape[dim] + end[dim]; + if (end[dim] == INT_MAX) end[dim] = shape[dim]; + if (dim > 0) { + ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); + strides[dim-1] = strides[dim] * shape[dim]; + } 
+ } + #pragma unroll + for (int j = 0; j < nvec; j++) { + idx[j] = 0; + int ref_idx = offset + j; + #pragma unroll + for (int dim = 0; dim < ndim; dim++) { + int stride = ref_strides[dim]; + if (shape[dim] > 1) { + idx[j] += (ref_idx / stride + begin[dim]) * strides[dim]; + } + ref_idx = ref_idx % stride; + } + } + VectorType ret; + #pragma unroll + for (int j = 0; j < nvec; j++) { + ret.x[j] = *(input + idx[j]); + } + return ret; +} + +template +__device__ inline VectorType fast_load_slice(const DType * input, + const Shape& shape, + Shape begin, + Shape end, + int offset) { + int idx = 0; + + Shape ref_strides; + Shape strides; + ref_strides[ndim-1] = 1; + strides[ndim-1] = 1; + #pragma unroll + for (int dim = ndim-1; dim >=0; dim--) { + if (begin[dim] < 0) begin[dim] = shape[dim] + begin[dim]; + if (end[dim] < 0) end[dim] = shape[dim] + end[dim]; + if (end[dim] == INT_MAX) end[dim] = shape[dim]; + if (dim > 0) { + ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); + strides[dim-1] = strides[dim] * shape[dim]; + } + } + int ref_idx = offset; + #pragma unroll + for (int dim = 0; dim < ndim; dim++) { + int stride = ref_strides[dim]; + if (shape[dim] > 1) { + idx += (ref_idx / stride + begin[dim]) * strides[dim]; + } + ref_idx = ref_idx % stride; + } + return global_load_index(input, idx, shape); +} + +template +__device__ inline void store_index(const VectorType value, int i, + DType * output, const Shape& shape) { + if (i < (shape.size + nvec - 1) / nvec) { + auto vector_output = reinterpret_cast< + typename VectorConfig::IndexType *>(output); + vector_output[i] = value.y; + } +} + +template +__device__ inline void store_add_index(const VectorType value, int i, + DType * output, const Shape& shape) { + if (i < (shape.size + nvec - 1) / nvec) { + auto vector_output = reinterpret_cast< + typename VectorConfig::IndexType *>(output); + VectorType ret(vector_output[i]); + ret += value; + vector_output[i] = ret.y; + } +} + +template +__device__ inline DType identity(const DType val) { + return val; +} + +template +__device__ inline DType add(const DType a, const DType2 b) { + return a + b; +} + +template +__device__ inline DType sub(const DType a, const DType2 b) { + return a - b; +} + +template +__device__ inline DType mul(const DType a, const DType2 b) { + return a * b; +} + +template +__device__ inline DType div(const DType a, const DType2 b) { + return a / b; +} + +template +__device__ inline DType rdiv(const DType a, const DType2 b) { + return b / a; +} + +template +__device__ inline DType power(const DType a, const DType2 b) { + return powf(a, b); +} + +template +__device__ inline DType rpow(const DType a, const DType2 b) { + return powf(b, a); +} + +template +__device__ inline DType max(const DType a, const DType2 b) { + return a > b ? a : b; +} + +template +__device__ inline DType min(const DType a, const DType2 b) { + return a < b ? a : b; +} + +template +__device__ inline DType hypot(const DType a, const DType2 b) { + return hypotf(a, b); +} + +template +__device__ inline typename LoadType::Type cast(const DType val) { + return static_cast::Type>(val); +} + +// activations + +template +__device__ inline DType relu(const DType val) { + return val > 0 ? 
val : 0; +} + +template +__device__ inline DType sigmoid(const DType val) { + return 1.f/(1 + expf(-val)); +} + +template +__device__ inline DType softrelu(const DType val) { + return logf(1 + expf(val)); +} + +template +__device__ inline DType softsign(const DType val) { + return val / (1 + fabsf(val)); +} + +// exp and log + +template +__device__ inline DType exp(const DType val) { + return expf(val); +} + +template +__device__ inline DType expm1(const DType val) { + return expm1f(val); +} + +template +__device__ inline DType log(const DType val) { + return logf(val); +} + +template +__device__ inline DType log10(const DType val) { + return log10f(val); +} + +template +__device__ inline DType log2(const DType val) { + return log2f(val); +} + +template +__device__ inline DType log1p(const DType val) { + return log1pf(val); +} + +// trigonometric + +constexpr double pi = 3.14159265358979323846; + +template +__device__ inline DType degrees(const DType val) { + return (val / pi) * 180; +} + +template +__device__ inline DType radians(const DType val) { + return (val / 180.0) * pi; +} + +template +__device__ inline DType sin(const DType val) { + return sinf(val); +} + +template +__device__ inline DType cos(const DType val) { + return cosf(val); +} + +template +__device__ inline DType tan(const DType val) { + return tanf(val); +} + +template +__device__ inline DType arcsin(const DType val) { + return asinf(val); +} + +template +__device__ inline DType arccos(const DType val) { + return acosf(val); +} + +template +__device__ inline DType arctan(const DType val) { + return atanf(val); +} + +template +__device__ inline DType sinh(const DType val) { + return sinhf(val); +} + +template +__device__ inline DType cosh(const DType val) { + return coshf(val); +} + +template +__device__ inline DType tanh(const DType val) { + return tanhf(val); +} + +template +__device__ inline DType arcsinh(const DType val) { + return asinhf(val); +} + +template +__device__ inline DType arccosh(const DType val) { + return acoshf(val); +} + +template +__device__ inline DType arctanh(const DType val) { + return atanhf(val); +} + +// sqrt + +template +__device__ inline DType sqrt(const DType val) { + return sqrtf(val); +} + +template +__device__ inline DType rsqrt(const DType val) { + return rsqrtf(val); +} + +template +__device__ inline DType cbrt(const DType val) { + return cbrtf(val); +} + +template +__device__ inline DType rcbrt(const DType val) { + return rcbrtf(val); +} + +template +__device__ inline DType square(const DType val) { + return val * val; +} + +template +__device__ inline typename LoadType::Type zero(const DType val) { + return 0; +} + +template +__device__ inline typename LoadType::Type zero() { + return 0; +} + +template +__device__ inline typename LoadType::Type one(const DType val) { + return 1; +} + +template +__device__ inline typename LoadType::Type one() { + return 1; +} + +template +__device__ inline DType round(const DType val) { + return roundf(val); +} + +template +__device__ inline DType rint(const DType val) { + return rintf(val); +} + +template +__device__ inline DType fix(const DType val) { + const auto floor = floorf(val); + const auto ceil = ceilf(val); + return (floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? 
floor : ceil; +} + +template +__device__ inline DType floor(const DType val) { + return floorf(val); +} + +template +__device__ inline DType ceil(const DType val) { + return ceilf(val); +} + +template +__device__ inline DType trunc(const DType val) { + return truncf(val); +} + +template +__device__ inline DType clip(const DType val, const float a_min, const float a_max) { + return max(min(val, a_max), a_min); +} + +template +__device__ inline DType sign(const DType val) { + if (val < 0) return -1; + return val > 0 ? 1 : 0; +} + +template +__device__ inline DType reciprocal(const DType val) { + return 1.0f / val; +} + +template +__device__ inline DType abs(const DType val) { + return fabsf(val); +} + +template +__device__ inline DType gamma(const DType val) { + return tgammaf(val); +} + +template +__device__ inline DType gammaln(const DType val) { + return lgammaf(val); +} + +template +__device__ inline DType erf(const DType val) { + return erff(val); +} + +template +__device__ inline DType erfinv(const DType val) { + return erfinvf(val); +} + +template +__device__ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { + const auto bsq = scalar * scalar; + const auto ibsq = 1.0f / bsq; + if (val > ibsq) { + return val - 0.5f * ibsq; + } else if (val < -ibsq) { + return -val - 0.5f * ibsq; + } else { + return 0.5f * val * val * bsq; + } +} + +} // namespace op + +)code"; + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_FORWARD_FUNCTIONS_INL_H_ diff --git a/src/common/cuda/rtc/half-inl.h b/src/common/cuda/rtc/half-inl.h new file mode 100644 index 000000000000..a467a5efdfa1 --- /dev/null +++ b/src/common/cuda/rtc/half-inl.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef MXNET_COMMON_CUDA_RTC_HALF_INL_H_ +#define MXNET_COMMON_CUDA_RTC_HALF_INL_H_ + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const char fp16_support_string[] = R"code( +struct __align__(2) __half { + __host__ __device__ __half() { } + unsigned short __x; +}; +/* Definitions of intrinsics */ +__device__ inline __half __float2half(const float f) { + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); + return val; +} +__device__ inline float __half2float(const __half h) { + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); + return val; +} + +typedef __half half; +)code"; + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_HALF_INL_H_ diff --git a/src/common/cuda/rtc/type-inl.h b/src/common/cuda/rtc/type-inl.h new file mode 100644 index 000000000000..927a8f43443a --- /dev/null +++ b/src/common/cuda/rtc/type-inl.h @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifndef MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ +#define MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const char type_support_string[] = R"code( +using float32 = float; +using float64 = double; +using float16 = half; +using uint8 = unsigned char; +using int8 = char; +using int32 = int; +using int64 = long long; +)code"; + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ diff --git a/src/common/cuda_utils.cc b/src/common/cuda/utils.cc similarity index 99% rename from src/common/cuda_utils.cc rename to src/common/cuda/utils.cc index 893b34e6ff29..b87c39386604 100644 --- a/src/common/cuda_utils.cc +++ b/src/common/cuda/utils.cc @@ -28,7 +28,7 @@ #include -#include "cuda_utils.h" +#include "utils.h" #if MXNET_USE_CUDA diff --git a/src/common/cuda_utils.h b/src/common/cuda/utils.h similarity index 100% rename from src/common/cuda_utils.h rename to src/common/cuda/utils.h diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda/vectorization.cuh similarity index 99% rename from src/common/cuda_vectorization.cuh rename to src/common/cuda/vectorization.cuh index 7803afb901ab..56253d3535be 100644 --- a/src/common/cuda_vectorization.cuh +++ b/src/common/cuda/vectorization.cuh @@ -29,7 +29,7 @@ #if MXNET_USE_CUDA && __CUDACC__ #include -#include "cuda_utils.h" +#include "utils.h" namespace mxnet { diff --git a/src/common/rtc.cc b/src/common/rtc.cc index 74c78f281fd0..21d3061e5209 100644 --- a/src/common/rtc.cc +++ b/src/common/rtc.cc @@ -20,7 +20,7 @@ #include #include -#include "../common/cuda_utils.h" +#include "cuda/utils.h" #include "../operator/operator_common.h" #if MXNET_USE_CUDA diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h index 42d03e55a275..da1e4bc436ab 100644 --- a/src/engine/stream_manager.h +++ b/src/engine/stream_manager.h @@ -29,7 +29,7 @@ #include #include #include -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" namespace mxnet { namespace engine { diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index e62351687083..3eda2c8712f7 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -30,7 +30,7 @@ #include #include #include "./threaded_engine.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" namespace mxnet { namespace engine { diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc index c6eb99508e09..9d14ef191457 100644 --- a/src/engine/threaded_engine_pooled.cc +++ b/src/engine/threaded_engine_pooled.cc @@ -32,7 +32,7 @@ #include "./thread_pool.h" #include "./stream_manager.h" #if MXNET_USE_CUDA -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #endif namespace mxnet { diff --git a/src/kvstore/kvstore_nccl.h b/src/kvstore/kvstore_nccl.h index e35f3a3da3fb..09bd880bfd68 100644 --- a/src/kvstore/kvstore_nccl.h +++ b/src/kvstore/kvstore_nccl.h @@ -38,7 +38,7 @@ #include #include "./comm.h" #include "./kvstore_local.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" // NCCL v2 introduces NCCL_MAJOR macro for versioning, // so if there is no such macro defined in nccl.h diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index a3f99c1afdf0..e00b4c3f948e 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -31,7 +31,7 @@ 
#include "../operator/tensor/init_op.h" #include "../operator/tensor/util/tensor_util-inl.h" #include "../operator/tensor/util/tensor_util-inl.cuh" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #include "./ndarray_function.h" #include "./ndarray_function-inl.h" #include "./ndarray_function-inl.cuh" diff --git a/src/operator/bilinear_sampler.cu b/src/operator/bilinear_sampler.cu index e8b1ce68847f..dae14a645fd8 100644 --- a/src/operator/bilinear_sampler.cu +++ b/src/operator/bilinear_sampler.cu @@ -26,7 +26,7 @@ #include "./bilinear_sampler-inl.h" #include -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #if MXNET_USE_CUDNN == 1 #include "./cudnn_bilinear_sampler-inl.h" #endif // MXNET_USE_CUDNN diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu index 62680d1fb8d1..2206b5aa67b3 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cu +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -29,7 +29,7 @@ #include #include #include -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" #include "../mxnet_op.h" #define DeformablePSROIPOOLING_CUDA_CHECK(condition) \ diff --git a/src/operator/contrib/nn/deformable_im2col.cuh b/src/operator/contrib/nn/deformable_im2col.cuh index 9494fb379faf..8efee9979046 100644 --- a/src/operator/contrib/nn/deformable_im2col.cuh +++ b/src/operator/contrib/nn/deformable_im2col.cuh @@ -67,7 +67,7 @@ #include #include #include "../../mxnet_op.h" -#include "../../../common/cuda_utils.h" +#include "../../../common/cuda/utils.h" diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu index 8765eb95b72e..62ecd4ce8baa 100644 --- a/src/operator/contrib/psroi_pooling.cu +++ b/src/operator/contrib/psroi_pooling.cu @@ -30,7 +30,7 @@ #include #include #include -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" #include "../mxnet_op.h" #define PSROIPOOLING_CUDA_CHECK(condition) \ diff --git a/src/operator/contrib/transformer.cu b/src/operator/contrib/transformer.cu index bcbc18525c09..bfa4993e0d4b 100644 --- a/src/operator/contrib/transformer.cu +++ b/src/operator/contrib/transformer.cu @@ -30,7 +30,7 @@ #include #include "./transformer-inl.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 58d0410a508e..06add1a39540 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -30,36 +30,6 @@ namespace mxnet { namespace fusion { -const char fp16_support_string[] = R"code( -struct __align__(2) __half { - __host__ __device__ __half() { } - unsigned short __x; -}; -/* Definitions of intrinsics */ -__device__ inline __half __float2half(const float f) { - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val.__x) : "f"(f)); - return val; -} -__device__ inline float __half2float(const __half h) { - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h.__x)); - return val; -} - -typedef __half half; -)code"; - -const char type_support_string[] = R"code( -using float32 = float; -using float64 = double; -using float16 = half; -using uint8 = unsigned char; -using int8 = char; -using int32 = int; -using int64 = long long; -)code"; - const std::map>> ops_desc = { {"elemwise_add" , {{"op::add(%, %)", "_0", "_1"}}}, {"_plus" , {{"op::add(%, %)", "_0", "_1"}}}, @@ -247,772 +217,6 @@ const std::vector 
variable_io_ops = { "_backward_cast" }; -const char function_definitions[] = R"code( - -#define INT_MAX (2147483647) - -namespace op { - -template -struct LoadType { - using Type = DType; -}; - -template <> -struct LoadType { - using Type = float; -}; - -template -__device__ inline typename LoadType::Type load(const DType input) { - return input; -} - -template <> -__device__ inline float load(const half input) { - return __half2float(input); -} - -template -__device__ inline DType1 store(const DType2 input, DType1* ref) { - return input; -} - -template -__device__ inline half store(const DType input, half* ref) { - return __float2half(input); -} - -template -struct VectorConfig { - static_assert(size >= 4, "VectorConfig needs to have size of at least 4B"); - using IndexType = float; -}; - -template <> -struct VectorConfig<8> { - using IndexType = double; -}; - -template <> -struct VectorConfig<16> { - using IndexType = double2; -}; - -template <> -struct VectorConfig<32> { - using IndexType = double4; -}; - -template -__device__ inline DType add_elem(const DType& x, const DType& y) { - return x + y; -} - -template <> -__device__ inline half add_elem(const half& x, const half& y) { - return __float2half(__half2float(x) + __half2float(y)); -} - -template -union VectorType { - typename VectorConfig::IndexType y; - DType x[nvec]; - __device__ VectorType () {}; - __device__ VectorType (const VectorType& y2) { - y = y2.y; - } - __device__ VectorType (const decltype(y) &y2) { - y = y2; - } - __device__ inline VectorType& operator+=(const VectorType& rhs) { - #pragma unroll - for (int i = 0; i < nvec; ++i) { - x[i] = add_elem(x[i], rhs.x[i]); - } - return *this; - } -}; - -template -struct Shape { - int x[ndim]; - size_t size; - __device__ inline const int& operator [](const int i) const { - return x[i]; - } - __device__ inline int& operator [](const int i) { - return x[i]; - } - __device__ inline void set(const int def) { - #pragma unroll - for (int i = 0; i < ndim; i++) { - x[i] = def; - } - } -}; - -template <> -struct Shape<0> { - size_t size; -}; - -template -__device__ inline VectorType load_index(const DType * input, int i, - const Shape &shape) { - if (i < shape.size) { - const auto* vector_input = reinterpret_cast< - const typename VectorConfig::IndexType *>( - input + i); - VectorType ret = {*vector_input}; - return ret; - } else { - VectorType ret({0}); - return ret; - } -} - -template -__device__ inline VectorType global_load_index(const DType * input, int i, - const Shape &shape) { - if (i < shape.size) { - const auto* vector_input = reinterpret_cast< - const typename VectorConfig::IndexType *>( - input + i); - VectorType ret = {__ldg(vector_input)}; - return ret; - } else { - VectorType ret({0}); - return ret; - } -} - -template -__device__ inline VectorType load_slice(const DType * input, const Shape& shape, - Shape begin, Shape end, - int offset) { - int idx[nvec]; - - Shape ref_strides; - Shape strides; - ref_strides[ndim-1] = 1; - strides[ndim-1] = 1; - #pragma unroll - for (int dim = ndim-1; dim >=0; dim--) { - if (begin[dim] < 0) begin[dim] = shape[dim] + begin[dim]; - if (end[dim] < 0) end[dim] = shape[dim] + end[dim]; - if (end[dim] == INT_MAX) end[dim] = shape[dim]; - if (dim > 0) { - ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); - strides[dim-1] = strides[dim] * shape[dim]; - } - } - #pragma unroll - for (int j = 0; j < nvec; j++) { - idx[j] = 0; - int ref_idx = offset + j; - #pragma unroll - for (int dim = 0; dim < ndim; dim++) { - int stride = 
ref_strides[dim]; - if (shape[dim] > 1) { - idx[j] += (ref_idx / stride + begin[dim]) * strides[dim]; - } - ref_idx = ref_idx % stride; - } - } - VectorType ret; - #pragma unroll - for (int j = 0; j < nvec; j++) { - ret.x[j] = *(input + idx[j]); - } - return ret; -} - -template -__device__ inline VectorType fast_load_slice(const DType * input, - const Shape& shape, - Shape begin, - Shape end, - int offset) { - int idx = 0; - - Shape ref_strides; - Shape strides; - ref_strides[ndim-1] = 1; - strides[ndim-1] = 1; - #pragma unroll - for (int dim = ndim-1; dim >=0; dim--) { - if (begin[dim] < 0) begin[dim] = shape[dim] + begin[dim]; - if (end[dim] < 0) end[dim] = shape[dim] + end[dim]; - if (end[dim] == INT_MAX) end[dim] = shape[dim]; - if (dim > 0) { - ref_strides[dim-1] = ref_strides[dim] * (end[dim] - begin[dim]); - strides[dim-1] = strides[dim] * shape[dim]; - } - } - int ref_idx = offset; - #pragma unroll - for (int dim = 0; dim < ndim; dim++) { - int stride = ref_strides[dim]; - if (shape[dim] > 1) { - idx += (ref_idx / stride + begin[dim]) * strides[dim]; - } - ref_idx = ref_idx % stride; - } - return global_load_index(input, idx, shape); -} - -template -__device__ inline void store_index(const VectorType value, int i, - DType * output, const Shape& shape) { - if (i < (shape.size + nvec - 1) / nvec) { - auto vector_output = reinterpret_cast< - typename VectorConfig::IndexType *>(output); - vector_output[i] = value.y; - } -} - -template -__device__ inline void store_add_index(const VectorType value, int i, - DType * output, const Shape& shape) { - if (i < (shape.size + nvec - 1) / nvec) { - auto vector_output = reinterpret_cast< - typename VectorConfig::IndexType *>(output); - VectorType ret(vector_output[i]); - ret += value; - vector_output[i] = ret.y; - } -} - -template -__device__ inline DType identity(const DType val) { - return val; -} - -template -__device__ inline DType add(const DType a, const DType2 b) { - return a + b; -} - -template -__device__ inline DType sub(const DType a, const DType2 b) { - return a - b; -} - -template -__device__ inline DType mul(const DType a, const DType2 b) { - return a * b; -} - -template -__device__ inline DType div(const DType a, const DType2 b) { - return a / b; -} - -template -__device__ inline DType rdiv(const DType a, const DType2 b) { - return b / a; -} - -template -__device__ inline DType power(const DType a, const DType2 b) { - return powf(a, b); -} - -template -__device__ inline DType rpow(const DType a, const DType2 b) { - return powf(b, a); -} - -template -__device__ inline DType max(const DType a, const DType2 b) { - return a > b ? a : b; -} - -template -__device__ inline DType min(const DType a, const DType2 b) { - return a < b ? a : b; -} - -template -__device__ inline DType hypot(const DType a, const DType2 b) { - return hypotf(a, b); -} - -template -__device__ inline typename LoadType::Type cast(const DType val) { - return static_cast::Type>(val); -} - -// activations - -template -__device__ inline DType relu(const DType val) { - return val > 0 ? val : 0; -} - -const float SQRT_2 = 1.4142135623730950488016887242096; -// compatible with mshadow_op.h version -template -__device__ inline DType gelu(const DType val) { - return DType(0.5f * static_cast(val) * - (1.0f + erf(static_cast(val) / SQRT_2))); -} - -template -__device__ inline DType sigmoid(const DType val) { - return 1.f/(1 + expf(-val)); -} - -template -__device__ inline DType softrelu(const DType val) { - // Avoid overflow of exp for large inputs. 
- // The threshold 20 is chosen such that softrelu(a) = a - // for a > 20 using floating precision. - return val > 20 ? val : logf(1 + expf(val)); -} - -template -__device__ inline DType softsign(const DType val) { - return val / (1 + fabsf(val)); -} - -// exp and log - -template -__device__ inline DType exp(const DType val) { - return expf(val); -} - -template -__device__ inline DType expm1(const DType val) { - return expm1f(val); -} - -template -__device__ inline DType log(const DType val) { - return logf(val); -} - -template -__device__ inline DType log10(const DType val) { - return log10f(val); -} - -template -__device__ inline DType log2(const DType val) { - return log2f(val); -} - -template -__device__ inline DType log1p(const DType val) { - return log1pf(val); -} - -// trigonometric - -constexpr double pi = 3.14159265358979323846; - -template -__device__ inline DType degrees(const DType val) { - return (val / pi) * 180; -} - -template -__device__ inline DType radians(const DType val) { - return (val / 180.0) * pi; -} - -template -__device__ inline DType sin(const DType val) { - return sinf(val); -} - -template -__device__ inline DType cos(const DType val) { - return cosf(val); -} - -template -__device__ inline DType tan(const DType val) { - return tanf(val); -} - -template -__device__ inline DType arcsin(const DType val) { - return asinf(val); -} - -template -__device__ inline DType arccos(const DType val) { - return acosf(val); -} - -template -__device__ inline DType arctan(const DType val) { - return atanf(val); -} - -template -__device__ inline DType sinh(const DType val) { - return sinhf(val); -} - -template -__device__ inline DType cosh(const DType val) { - return coshf(val); -} - -template -__device__ inline DType tanh(const DType val) { - return tanhf(val); -} - -template -__device__ inline DType arcsinh(const DType val) { - return asinhf(val); -} - -template -__device__ inline DType arccosh(const DType val) { - return acoshf(val); -} - -template -__device__ inline DType arctanh(const DType val) { - return atanhf(val); -} - -// sqrt - -template -__device__ inline DType sqrt(const DType val) { - return sqrtf(val); -} - -template -__device__ inline DType rsqrt(const DType val) { - return rsqrtf(val); -} - -template -__device__ inline DType cbrt(const DType val) { - return cbrtf(val); -} - -template -__device__ inline DType rcbrt(const DType val) { - return rcbrtf(val); -} - -template -__device__ inline DType square(const DType val) { - return val * val; -} - -template -__device__ inline typename LoadType::Type zero(const DType val) { - return 0; -} - -template -__device__ inline typename LoadType::Type zero() { - return 0; -} - -template -__device__ inline typename LoadType::Type one(const DType val) { - return 1; -} - -template -__device__ inline typename LoadType::Type one() { - return 1; -} - -template -__device__ inline DType round(const DType val) { - return roundf(val); -} - -template -__device__ inline DType rint(const DType val) { - return rintf(val); -} - -template -__device__ inline DType fix(const DType val) { - const auto floor = floorf(val); - const auto ceil = ceilf(val); - return (floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? 
floor : ceil; -} - -template -__device__ inline DType floor(const DType val) { - return floorf(val); -} - -template -__device__ inline DType ceil(const DType val) { - return ceilf(val); -} - -template -__device__ inline DType trunc(const DType val) { - return truncf(val); -} - -template -__device__ inline DType clip(const DType val, const float a_min, const float a_max) { - return max(min(val, a_max), a_min); -} - -template -__device__ inline DType sign(const DType val) { - if (val < 0) return -1; - return val > 0 ? 1 : 0; -} - -template -__device__ inline DType reciprocal(const DType val) { - return 1.0f / val; -} - -template -__device__ inline DType abs(const DType val) { - return fabsf(val); -} - -template -__device__ inline DType gamma(const DType val) { - return tgammaf(val); -} - -template -__device__ inline DType gammaln(const DType val) { - return lgammaf(val); -} - -template -__device__ inline DType erf(const DType val) { - return erff(val); -} - -template -__device__ inline DType erfinv(const DType val) { - return erfinvf(val); -} - -template -__device__ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { - const auto bsq = scalar * scalar; - const auto ibsq = 1.0f / bsq; - if (val > ibsq) { - return val - 0.5f * ibsq; - } else if (val < -ibsq) { - return -val - 0.5f * ibsq; - } else { - return 0.5f * val * val * bsq; - } -} - -} // namespace op - -)code"; - -const char backward_function_definitions[] = R"code( - -namespace op { - -template -__device__ inline DTypeGrad backward_relu(const DType val, const DTypeGrad grad) { - return val > 0 ? grad : 0; -} - -template -__device__ inline DTypeGrad backward_sigmoid(const DType out, const DTypeGrad grad) { - return grad * out * (1 - out); -} - -template -__device__ inline DTypeGrad backward_softrelu(const DType val, const DTypeGrad grad) { - return grad * sigmoid(val); -} - -template -__device__ inline DTypeGrad backward_softsign(const DType val, const DTypeGrad grad) { - const DType ap1 = 1 + fabsf(val); - return grad / (ap1 * ap1); -} - -template -__device__ inline DTypeGrad backward_exp(const DType val, const DTypeGrad grad) { - return grad * expf(val); -} - -template -__device__ inline DTypeGrad backward_expm1(const DType val, const DTypeGrad grad) { - return grad * expf(val); -} - -template -__device__ inline DTypeGrad backward_log(const DType val, const DTypeGrad grad) { - return grad / val; -} - -template -__device__ inline DTypeGrad backward_log10(const DType val, const DTypeGrad grad) { - return grad / (val * logf(10)); -} - -template -__device__ inline DTypeGrad backward_log2(const DType val, const DTypeGrad grad) { - return grad / (val * logf(2)); -} - -template -__device__ inline DTypeGrad backward_log1p(const DType val, const DTypeGrad grad) { - return grad / (1 + val); -} - -template -__device__ inline DTypeGrad backward_sin(const DType val, const DTypeGrad grad) { - return grad * cosf(val); -} - -template -__device__ inline DTypeGrad backward_cos(const DType val, const DTypeGrad grad) { - return -grad * sinf(val); -} - -// Uses output from tan -template -__device__ inline DTypeGrad backward_tan(const DType out, const DTypeGrad grad) { - return grad * (out * out + 1); -} - -template -__device__ inline DTypeGrad backward_arcsin(const DType val, const DTypeGrad grad) { - return grad / sqrtf(1 - val*val); -} - -template -__device__ inline DTypeGrad backward_arccos(const DType val, const DTypeGrad grad) { - return -grad / sqrtf(1 - val*val); -} - -template -__device__ inline DTypeGrad backward_arctan(const DType 
val, const DTypeGrad grad) { - return grad / (1 + val*val); -} - -template -__device__ inline DTypeGrad backward_sinh(const DType val, const DTypeGrad grad) { - return grad * coshf(val); -} - -template -__device__ inline DTypeGrad backward_cosh(const DType val, const DTypeGrad grad) { - return grad * sinhf(val); -} - -// Uses tanh output -template -__device__ inline DTypeGrad backward_tanh(const DType out, const DTypeGrad grad) { - return grad * (1 - out * out); -} - -template -__device__ inline DTypeGrad backward_arcsinh(const DType val, const DTypeGrad grad) { - return grad / sqrtf(val * val + 1); -} - -template -__device__ inline DTypeGrad backward_arccosh(const DType val, const DTypeGrad grad) { - return grad / sqrtf(val * val - 1); -} - -template -__device__ inline DTypeGrad backward_arctanh(const DType val, const DTypeGrad grad) { - return grad / (1 - val * val); -} - -template -__device__ inline DTypeGrad backward_sqrt(const DType out, const DTypeGrad grad) { - return 0.5 * grad / out; -} - -template -__device__ inline DTypeGrad backward_rsqrt(const DType val, const DTypeGrad grad) { - const DType inv = 1 / val; - return -0.5 * grad * sqrtf(inv) * inv; -} - -template -__device__ inline DTypeGrad backward_cbrt(const DType out, const DTypeGrad grad) { - return grad / (3.0f * out * out); -} - -template -__device__ inline DTypeGrad backward_rcbrt(const DType val, const DTypeGrad grad) { - const DType inv = 1 / val; - return -1.f/3.f * grad * cbrtf(inv) * inv; -} - -template -__device__ inline DTypeGrad backward_square(const DType val, const DTypeGrad grad) { - return 2 * val * grad; -} - -template -__device__ inline DTypeGrad backward_clip(const DType val, const DTypeGrad grad, - const float a_min, const float a_max) { - if (val > a_max || val < a_min) { - return 0; - } else { - return grad; - } -} - -template -__device__ inline DTypeGrad backward_reciprocal(const DType val, const DTypeGrad grad) { - return -grad / (val * val); -} - -template -__device__ inline DTypeGrad backward_erf(const DType val, const DTypeGrad grad) { - return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; -} - -template -__device__ inline DTypeGrad backward_erfinv(const DType val, const DTypeGrad grad) { - return 0.5f * sqrt(pi) * exp(val * val) * grad; -} - -template -__device__ inline DTypeGrad backward_smooth_l1(const DType val, const DType2 scalar, - const DTypeGrad grad) { - auto bsq = scalar * scalar; - auto ibsq = 1.0f / bsq; - if (val > ibsq) { - return grad; - } else if (val < -ibsq) { - return -grad; - } else { - return bsq * val * grad; - } -} - -// compatible with mshadow_op.h version -template -__device__ inline DTypeGrad backward_gelu(const DType val, const DTypeGrad grad) { - return grad * DType(0.5f * (1.0f + erf(static_cast(val) / SQRT_2) + - static_cast(val) * backward_erf(static_cast(val) / SQRT_2, 1.0f) / SQRT_2)); -} - -} // namespace op - -)code"; const char kernel_begin[] = R"code( const int tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 39b576b47a65..1863b4aae8ee 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -18,8 +18,6 @@ */ #include -#include -#include #include #include #include @@ -28,7 +26,8 @@ #include "../operator_common.h" #include "../elemwise_op_common.h" #include "../../imperative/exec_pass.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" +#include "../../common/cuda/rtc.h" namespace mxnet { @@ -167,30 +166,6 @@ void 
AddPointerAndShape(const TBlob& data, }); } -// Obtain compilation log from the program. -std::string GetCompileLog(nvrtcProgram program) { - size_t log_size_including_null; - NVRTC_CALL(nvrtcGetProgramLogSize(program, &log_size_including_null)); - // For most std::string implementations, this is probably 1 char bigger than needed. OK though. - std::string log(log_size_including_null, '\0'); - NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); - // Make sure the string reflects the true size (so minus the null terminator). - log.resize(log_size_including_null - 1); - return log; -} - -// Obtain compilation result (ptx assembly) from the program. -std::string GetPtx(nvrtcProgram program) { - size_t ptx_size_including_null; - NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size_including_null)); - // For most std::string implementations, this is probably 1 char bigger than needed. OK though. - std::string ptx(ptx_size_including_null, '\0'); - NVRTC_CALL(nvrtcGetPTX(program, &ptx[0])); - // Make sure the string reflects the true size (so minus the null terminator). - ptx.resize(ptx_size_including_null - 1); - return ptx; -} - } // namespace std::string FusedOp::GenerateCode(const std::vector &req, @@ -592,86 +567,7 @@ std::string FusedOp::GenerateCode(const std::vector &req, CUfunction FusedOp::CompileCode(const std::string &code, const std::string &kernel_name, int dev_id) { - // Guard NVRTC calls - std::lock_guard lock_nvrtc(mutex_); - // Local class for value type of compile cache - struct KernelInfo { - std::string mangled_name; - std::string ptx; - std::vector functions; - }; - // Maps from the cuda source code (minus header) to the ptx and jit-compiled CUfunctions. - using KernelCache = std::map; - // Per-gpu-architecture compiled kernel cache with jit-compiled function for each device context - static std::map compiled_kernels; - int sm_arch = SMArch(dev_id); - KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; // make null map as needed - KernelInfo& kinfo = compiled_kernels_this_arch[code]; // make KernelInfo as needed - if (kinfo.ptx.size() == 0) { - // It's the first time we've seen this kernel, so we need to generate the ptx and mangled_name. - static std::string common_header = - std::string(fusion::fp16_support_string) + "\n" + - fusion::type_support_string + "\n" + - fusion::function_definitions + "\n" + - fusion::backward_function_definitions + "\n"; - std::string code_with_header = common_header + code; - // If verbose mode, output kernel source, though not including the common header - if (dmlc::GetEnv("MXNET_FUSION_VERBOSE", false)) { - LOG(INFO) << "\n" << std::string(80, '-') << "\n" << code; - } - if (compiled_kernels_this_arch.size() == CACHESIZE_WARN_THRESHOLD + 1 && - dmlc::GetEnv("MXNET_FUSION_SIZE_WARNING", true)) { - LOG(WARNING) << "The number of different fused ops exceeds " << CACHESIZE_WARN_THRESHOLD - << ". 
Set MXNET_FUSION_SIZE_WARNING=0 to quiet this warning."; - } - nvrtcProgram program; - NVRTC_CALL(nvrtcCreateProgram(&program, // prog - &code_with_header[0], // buffer - (kernel_name + "_kernel.cu").c_str(), // name - 0, // num headers - nullptr, // headers - nullptr)); // include names - - std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); - const char *opts[] = {gpu_arch_arg.c_str(), - "--std=c++14"}; - const std::string kernel_name_demangled = "FusedKernel_" + kernel_name; - NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); - - nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 2, // num options - opts); // options - CHECK_EQ(compileResult, NVRTC_SUCCESS) - << "NVRTC Compilation failed. Please set environment variable MXNET_USE_FUSION to 0.\n" - << GetCompileLog(program); - - kinfo.ptx = GetPtx(program); - const char *mangled_name; - NVRTC_CALL(nvrtcGetLoweredName(program, - kernel_name_demangled.c_str(), - &mangled_name)); - kinfo.mangled_name = mangled_name; - // Destroy the program. - NVRTC_CALL(nvrtcDestroyProgram(&program)); - } - // Ensure function array is deep enough to index by dev_id - while (kinfo.functions.size() <= static_cast(dev_id)) - kinfo.functions.push_back(static_cast(nullptr)); - // Jit-compile ptx for the device as needed - if (kinfo.functions[dev_id] == static_cast(nullptr)) { - // Make sure driver context is set to the proper device - CUdevice cu_device; - CUcontext context; - CUDA_DRIVER_CALL(cuDeviceGet(&cu_device, dev_id)); - CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); - // Jit-compile ptx for the driver's current context - CUmodule module; - CUDA_DRIVER_CALL(cuModuleLoadData(&module, kinfo.ptx.c_str())); - CUDA_DRIVER_CALL(cuModuleGetFunction(&kinfo.functions[dev_id], - module, - kinfo.mangled_name.c_str())); - } - return kinfo.functions[dev_id]; + return common::cuda::rtc::get_function(code, kernel_name, dev_id); } @@ -776,7 +672,6 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, << ", not expecting switch to device " << dev_id; Stream* s = ctx.get_stream(); - auto stream = Stream::GetStream(s); std::vector args; size_t N = 0; for (const auto& output : outputs) { @@ -816,12 +711,10 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, } } } - CUDA_DRIVER_CALL( - cuLaunchKernel(kernel_functions_[kernel_variant], - num_blocks, 1, 1, // grid dim - FusedOp::NTHREADS, 1, 1, // block dim - 0, stream, // shared mem and stream - &(args[0]), 0)); // arguments + common::cuda::rtc::launch(kernel_functions_[kernel_variant], + {num_blocks, 1, 1}, + {FusedOp::NTHREADS, 1, 1}, + 0, s, &args); } void FusedOpForwardGPU(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h index 104acd585bdb..9bdd54762878 100644 --- a/src/operator/linalg_impl.h +++ b/src/operator/linalg_impl.h @@ -29,7 +29,7 @@ #include -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #include "mxnet_op.h" // Convenience functions. 
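[Note, not part of the patch] The fused_op.cu hunks above drop the operator-local NVRTC handling in favour of the shared common::cuda::rtc helpers: get_function() compiles a source string once per (source, SM architecture) and caches the resulting CUfunction, while launch() wraps cuLaunchKernel. The sketch below shows how a caller is expected to drive those helpers once the series is applied; it is illustrative only. The example kernel, function and variable names are made up, the include path is assumed to be relative to src/ as in the patched files, and the std::vector<const void*> argument form follows the launch() signature as it stands after the later "Unary ops via RTC" commit in this series.

#include <string>
#include <vector>
#include <mxnet/base.h>
#include "common/cuda/rtc.h"   // assumed path, relative to src/

namespace example {

// Compile a trivial kernel with the new RTC helpers and launch it.
void AddOne(mshadow::Stream<mshadow::gpu> *s, int dev_id,
            float *out, const float *in, int n) {
  using namespace mxnet::common::cuda::rtc;
  // Any self-contained CUDA C++ source; get_function() prepends the common
  // header (fp16/type/vectorization support) before NVRTC compilation and
  // caches the compiled function per (source, SM architecture).
  const std::string code = R"code(
  __global__ void example_kernel(float *out, const float *in, const int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) out[tid] = in[tid] + 1.f;
  }
  )code";
  CUfunction kernel = get_function(code, "example_kernel", dev_id);

  // Arguments are passed as pointers to the kernel parameters, mirroring the
  // cuLaunchKernel convention that launch() wraps.
  std::vector<const void*> args = {&out, &in, &n};
  const unsigned int threads = 512;
  const unsigned int blocks =
      static_cast<unsigned int>((n + threads - 1) / threads);
  launch(kernel, {blocks, 1, 1}, {threads, 1, 1},
         0 /* shared memory bytes */, s, &args);
}

}  // namespace example

Keeping the cache keyed on the raw source plus SM architecture (rather than per operator) is what lets FusedOp::CompileCode shrink to a single get_function() call in the hunk above.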
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 8b7a38be3986..81cb4493798d 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -35,7 +35,7 @@ #include "../engine/openmp.h" #ifdef __CUDACC__ -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #endif // __CUDACC__ namespace mxnet { diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 0875f05e669d..1a5f83ec13d9 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -42,7 +42,6 @@ #include "./cudnn/cudnn_batch_norm-inl.h" #endif -#include "../../common/cuda_utils.h" #include "../../../include/mxnet/tensor_blob.h" using namespace mxnet; diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h index 186274b2f1e1..5ad0da3d5dea 100644 --- a/src/operator/nn/cudnn/cudnn_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_activation-inl.h @@ -29,7 +29,7 @@ #include #include #include "../activation-inl.h" -#include "../../../common/cuda_utils.h" +#include "../../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h index f7e01e214719..00939cfd8679 100644 --- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -32,7 +32,7 @@ #include #include #include -#include "../../../common/cuda_utils.h" +#include "../../../common/cuda/utils.h" #include "../convolution-inl.h" #include "../deconvolution-inl.h" namespace mxnet { diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 056f93b5237c..d7d0c39ffb88 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -33,7 +33,7 @@ #include #include "../convolution-inl.h" #include "./cudnn_algoreg-inl.h" -#include "../../../common/cuda_utils.h" +#include "../../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index b701883366ec..0fd88a8d9221 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -33,7 +33,7 @@ #include #include "../deconvolution-inl.h" #include "./cudnn_algoreg-inl.h" -#include "../../../common/cuda_utils.h" +#include "../../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h index 9db2650491a8..cd2cbd1c0788 100644 --- a/src/operator/nn/depthwise_convolution-inl.h +++ b/src/operator/nn/depthwise_convolution-inl.h @@ -27,7 +27,7 @@ #include #include #include "./convolution-inl.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" #if MXNET_USE_CUDA #include diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh index e59d8986b895..bb91ea9fb050 100644 --- a/src/operator/nn/depthwise_convolution_tf.cuh +++ b/src/operator/nn/depthwise_convolution_tf.cuh @@ -26,7 +26,7 @@ */ #ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ #define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" #include "../mxnet_op.h" namespace tf { diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index e771b3681573..92d4e43d51ea 100644 --- a/src/operator/nn/pool.cuh +++ 
b/src/operator/nn/pool.cuh @@ -83,7 +83,7 @@ #include "./pool_utils.h" #include "../mxnet_op.h" #include "../mshadow_op.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 945decb9c632..8b7c02d574f1 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -34,7 +34,7 @@ #include "../mxnet_op.h" #include "../operator_common.h" #include "../tensor/broadcast_reduce_op.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index ccfebf597f67..31c666307768 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -39,7 +39,7 @@ #include #include #include -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #include "../common/utils.h" namespace mxnet { diff --git a/src/operator/pad.cu b/src/operator/pad.cu index 643e62db722a..8d82ba337fdd 100644 --- a/src/operator/pad.cu +++ b/src/operator/pad.cu @@ -25,7 +25,7 @@ */ #include #include "./pad-inl.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" namespace mshadow { namespace cuda { diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 46bb5e79b262..d511791a82da 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -34,7 +34,7 @@ #include "../mxnet_op.h" #include "../operator_common.h" #if MXNET_USE_CUDA -#include "../../common/cuda_vectorization.cuh" +#include "../../common/cuda/vectorization.cuh" #endif namespace mxnet { diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh index 0bb9fa636f45..f2e922e079ac 100644 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -28,7 +28,7 @@ #include #include "../operator_common.h" -#include "../../common/cuda_vectorization.cuh" +#include "../../common/cuda/vectorization.cuh" #include diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh index bbe606777dcc..497098d76ff9 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -28,7 +28,7 @@ #include #include "../operator_common.h" -#include "../../common/cuda_vectorization.cuh" +#include "../../common/cuda/vectorization.cuh" #include diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index 352c74ea9445..bc58834e5fc0 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -24,7 +24,7 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" -#include "../../common/cuda_vectorization.cuh" +#include "../../common/cuda/vectorization.cuh" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index 8688a8b8ac66..da611c82a537 100644 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -28,7 +28,7 @@ #include #include "../operator_common.h" -#include "../../common/cuda_vectorization.cuh" +#include "../../common/cuda/vectorization.cuh" #include diff --git a/src/operator/tensor/pseudo2DTranspose_op-inl.cuh b/src/operator/tensor/pseudo2DTranspose_op-inl.cuh index 
5898c0bcf07c..c89fe2e2b959 100644 --- a/src/operator/tensor/pseudo2DTranspose_op-inl.cuh +++ b/src/operator/tensor/pseudo2DTranspose_op-inl.cuh @@ -32,7 +32,7 @@ #include #include #include -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" namespace mxnet { diff --git a/src/profiler/profiler.cc b/src/profiler/profiler.cc index 13ab462ab69c..69990e8282b0 100644 --- a/src/profiler/profiler.cc +++ b/src/profiler/profiler.cc @@ -31,7 +31,7 @@ #include "./profiler.h" #if MXNET_USE_CUDA -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #endif #if defined(_MSC_VER) && _MSC_VER <= 1800 diff --git a/src/resource.cc b/src/resource.cc index 9f5ecaf89b27..bb01acfac7b9 100644 --- a/src/resource.cc +++ b/src/resource.cc @@ -33,7 +33,7 @@ #include #include "./common/lazy_alloc_array.h" #include "./common/utils.h" -#include "./common/cuda_utils.h" +#include "./common/cuda/utils.h" #include "./profiler/storage_profiler.h" namespace mxnet { diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h index 3eabe1b502be..ad5d61352b46 100644 --- a/src/storage/gpu_device_storage.h +++ b/src/storage/gpu_device_storage.h @@ -27,7 +27,7 @@ #include "mxnet/base.h" #include "mxnet/storage.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #include "../profiler/storage_profiler.h" #if MXNET_USE_CUDA #include diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h index 5d03fd13a3c9..31140daf2840 100644 --- a/src/storage/pinned_memory_storage.h +++ b/src/storage/pinned_memory_storage.h @@ -29,7 +29,7 @@ #include #include "mxnet/base.h" #include "mxnet/storage.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #include "../profiler/storage_profiler.h" namespace mxnet { diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index c4f32b22bdc2..769830b90c74 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -37,7 +37,7 @@ #include #include #include "./storage_manager.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" #include "../common/utils.h" #include "../profiler/storage_profiler.h" From 5d01ad204ae6dea2c985eecb9d45dec9b88b7d20 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 22 May 2020 09:50:55 -0700 Subject: [PATCH 04/64] Unary ops via RTC --- src/common/cuda/rtc.cc | 55 ++- src/common/cuda/rtc.h | 10 +- src/common/cuda/rtc/backward_functions-inl.h | 36 +- src/common/cuda/rtc/forward_functions-inl.h | 290 ++++++------ src/common/cuda/rtc/half-inl.h | 28 ++ src/common/cuda/rtc/type-inl.h | 62 ++- src/common/cuda/rtc/vectorization-inl.h | 448 ++++++++++++++++++ src/common/cuda/vectorization.cuh | 6 +- src/libinfo.cc | 1 - src/operator/contrib/stes_op.cu | 8 +- src/operator/fusion/fused_op-inl.h | 1 + src/operator/fusion/fused_op.cu | 4 +- src/operator/fusion/fused_op.h | 1 - .../numpy/np_elemwise_unary_op_basic.cu | 104 ++-- src/operator/tensor/cast_storage-inl.h | 2 +- src/operator/tensor/elemwise_unary_op.cuh | 149 +++--- src/operator/tensor/elemwise_unary_op.h | 216 +++++---- .../tensor/elemwise_unary_op_basic.cu | 54 +-- .../tensor/elemwise_unary_op_logexp.cu | 16 +- src/operator/tensor/elemwise_unary_op_pow.cu | 18 +- src/operator/tensor/elemwise_unary_op_trig.cu | 48 +- tests/python/gpu/test_fusion.py | 1 + 22 files changed, 1068 insertions(+), 490 deletions(-) create mode 100644 src/common/cuda/rtc/vectorization-inl.h diff --git a/src/common/cuda/rtc.cc 
b/src/common/cuda/rtc.cc index 6d3bb62cd3b8..5900b31489da 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -25,12 +25,16 @@ #include #include +#include +#include +#include #include "rtc.h" #include "rtc/half-inl.h" #include "rtc/type-inl.h" #include "rtc/forward_functions-inl.h" #include "rtc/backward_functions-inl.h" +#include "rtc/vectorization-inl.h" #include "utils.h" @@ -41,6 +45,23 @@ namespace rtc { std::mutex lock; +namespace util { + +std::string to_string(OpReqType req) { + switch (req) { + case kNullOp: + return "OpReqType::kNullOp"; + case kWriteTo: + case kWriteInplace: + return "OpReqType::kWriteTo"; + case kAddTo: + return "OpReqType::kAddTo"; + } + LOG(FATAL) << "Unrecognized req."; +} + +} // namespace util + namespace { // Obtain compilation log from the program. @@ -81,9 +102,9 @@ CUfunction get_function(const std::string &code, std::vector functions; }; // Maps from the cuda source code (minus header) to the ptx and jit-compiled CUfunctions. - using KernelCache = std::map; + using KernelCache = std::unordered_map; // Per-gpu-architecture compiled kernel cache with jit-compiled function for each device context - static std::map compiled_kernels; + static std::unordered_map compiled_kernels; int sm_arch = SMArch(dev_id); KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; // make null map as needed KernelInfo& kinfo = compiled_kernels_this_arch[code]; // make KernelInfo as needed @@ -92,8 +113,10 @@ CUfunction get_function(const std::string &code, static std::string common_header = std::string(fp16_support_string) + "\n" + type_support_string + "\n" + + op_req_type_string + "\n" + function_definitions + "\n" + - backward_function_definitions + "\n"; + backward_function_definitions + "\n" + + vectorization_support_string + "\n"; std::string code_with_header = common_header + code; // If verbose mode, output kernel source, though not including the common header if (dmlc::GetEnv("MXNET_RTC_VERBOSE", false)) { @@ -101,10 +124,15 @@ CUfunction get_function(const std::string &code, } if (compiled_kernels_this_arch.size() == CACHESIZE_WARN_THRESHOLD + 1 && dmlc::GetEnv("MXNET_RTC_SIZE_WARNING", true)) { - LOG(WARNING) << "The number of different compiled kernels exceeds " << CACHESIZE_WARN_THRESHOLD + LOG(WARNING) << "The number of different compiled kernels exceeds " + << CACHESIZE_WARN_THRESHOLD << ". 
Set MXNET_RTC_SIZE_WARNING=0 to quiet this warning."; } nvrtcProgram program; + std::ofstream f("debug.log"); + f << code_with_header; + f.close(); + NVRTC_CALL(nvrtcCreateProgram(&program, // prog &code_with_header[0], // buffer (kernel_name + "_kernel.cu").c_str(), // name @@ -115,7 +143,7 @@ CUfunction get_function(const std::string &code, std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); const char *opts[] = {gpu_arch_arg.c_str(), "--std=c++11"}; - const std::string kernel_name_demangled = "FusedKernel_" + kernel_name; + const std::string kernel_name_demangled = kernel_name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); nvrtcResult compileResult = nvrtcCompileProgram(program, // prog @@ -159,14 +187,17 @@ void launch(CUfunction function, const dim3 block_dim, unsigned int shared_mem_bytes, mshadow::Stream *stream, - std::vector *args) { + std::vector *args) { + CHECK(args->size() != 0) << + "Empty argument list passed to a kernel."; CUDA_DRIVER_CALL( - cuLaunchKernel(function, // function to launch - grid_dim.x, grid_dim.y, grid_dim.z, // grid dim - block_dim.x, block_dim.y, block_dim.z, // block dim - shared_mem_bytes, // shared memory - mshadow::Stream::GetStream(stream), // stream - &((*args)[0]), nullptr)); // arguments + cuLaunchKernel(function, // function to launch + grid_dim.x, grid_dim.y, grid_dim.z, // grid dim + block_dim.x, block_dim.y, block_dim.z, // block dim + shared_mem_bytes, // shared memory + mshadow::Stream::GetStream(stream), // stream + const_cast(args->data()), // arguments + nullptr)); } } // namespace rtc diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h index 811936915d31..799fbacf698c 100644 --- a/src/common/cuda/rtc.h +++ b/src/common/cuda/rtc.h @@ -28,6 +28,7 @@ #define MXNET_COMMON_CUDA_RTC_H_ #include "mxnet/base.h" +#include "mxnet/op_attr_types.h" #if MXNET_USE_CUDA @@ -36,12 +37,19 @@ #include #include +#include namespace mxnet { namespace common { namespace cuda { namespace rtc { +namespace util { + +std::string to_string(OpReqType req); + +} + extern std::mutex lock; CUfunction get_function(const std::string &code, @@ -53,7 +61,7 @@ void launch(CUfunction function, const dim3 block_dim, unsigned int shared_mem_bytes, mshadow::Stream *stream, - std::vector *args); + std::vector *args); } // namespace rtc } // namespace cuda diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index b5ae8a361a3b..72e70cb60ded 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -33,7 +33,7 @@ namespace op { template __device__ inline DTypeGrad backward_relu(const DType val, const DTypeGrad grad) { - return val > 0 ? grad : 0; + return (isnan(val) || val > 0) ? 
grad : 0; } template @@ -48,18 +48,18 @@ __device__ inline DTypeGrad backward_softrelu(const DType val, const DTypeGrad g template __device__ inline DTypeGrad backward_softsign(const DType val, const DTypeGrad grad) { - const DType ap1 = 1 + fabsf(val); + const DType ap1 = 1 + op::abs(val); return grad / (ap1 * ap1); } template __device__ inline DTypeGrad backward_exp(const DType val, const DTypeGrad grad) { - return grad * expf(val); + return grad * op::exp(val); } template __device__ inline DTypeGrad backward_expm1(const DType val, const DTypeGrad grad) { - return grad * expf(val); + return backward_exp(val, grad); } template @@ -69,12 +69,12 @@ __device__ inline DTypeGrad backward_log(const DType val, const DTypeGrad grad) template __device__ inline DTypeGrad backward_log10(const DType val, const DTypeGrad grad) { - return grad / (val * logf(10)); + return grad / (val * op::log(static_cast(10))); } template __device__ inline DTypeGrad backward_log2(const DType val, const DTypeGrad grad) { - return grad / (val * logf(2)); + return grad / (val * op::log(static_cast(2))); } template @@ -84,12 +84,12 @@ __device__ inline DTypeGrad backward_log1p(const DType val, const DTypeGrad grad template __device__ inline DTypeGrad backward_sin(const DType val, const DTypeGrad grad) { - return grad * cosf(val); + return grad * op::cos(val); } template __device__ inline DTypeGrad backward_cos(const DType val, const DTypeGrad grad) { - return -grad * sinf(val); + return -grad * op::sin(val); } // Uses output from tan @@ -100,12 +100,12 @@ __device__ inline DTypeGrad backward_tan(const DType out, const DTypeGrad grad) template __device__ inline DTypeGrad backward_arcsin(const DType val, const DTypeGrad grad) { - return grad / sqrtf(1 - val*val); + return grad / op::sqrt(1 - val*val); } template __device__ inline DTypeGrad backward_arccos(const DType val, const DTypeGrad grad) { - return -grad / sqrtf(1 - val*val); + return -grad / op::sqrt(1 - val*val); } template @@ -115,12 +115,12 @@ __device__ inline DTypeGrad backward_arctan(const DType val, const DTypeGrad gra template __device__ inline DTypeGrad backward_sinh(const DType val, const DTypeGrad grad) { - return grad * coshf(val); + return grad * op::cosh(val); } template __device__ inline DTypeGrad backward_cosh(const DType val, const DTypeGrad grad) { - return grad * sinhf(val); + return grad * op::sinh(val); } // Uses tanh output @@ -131,12 +131,12 @@ __device__ inline DTypeGrad backward_tanh(const DType out, const DTypeGrad grad) template __device__ inline DTypeGrad backward_arcsinh(const DType val, const DTypeGrad grad) { - return grad / sqrtf(val * val + 1); + return grad / op::sqrt(val * val + 1); } template __device__ inline DTypeGrad backward_arccosh(const DType val, const DTypeGrad grad) { - return grad / sqrtf(val * val - 1); + return grad / op::sqrt(val * val - 1); } template @@ -152,7 +152,7 @@ __device__ inline DTypeGrad backward_sqrt(const DType out, const DTypeGrad grad) template __device__ inline DTypeGrad backward_rsqrt(const DType val, const DTypeGrad grad) { const DType inv = 1 / val; - return -0.5 * grad * sqrtf(inv) * inv; + return -0.5 * grad * op::sqrt(inv) * inv; } template @@ -163,7 +163,7 @@ __device__ inline DTypeGrad backward_cbrt(const DType out, const DTypeGrad grad) template __device__ inline DTypeGrad backward_rcbrt(const DType val, const DTypeGrad grad) { const DType inv = 1 / val; - return -1.f/3.f * grad * cbrtf(inv) * inv; + return -1.f/3.f * grad * op::cbrt(inv) * inv; } template @@ -188,12 +188,12 @@ __device__ 
inline DTypeGrad backward_reciprocal(const DType val, const DTypeGrad template __device__ inline DTypeGrad backward_erf(const DType val, const DTypeGrad grad) { - return 2.0f / sqrt(pi) * exp(-(val*val)) * grad; + return 2.0f / op::sqrt(pi) * op::exp(-(val*val)) * grad; } template __device__ inline DTypeGrad backward_erfinv(const DType val, const DTypeGrad grad) { - return 0.5f * sqrt(pi) * exp(val * val) * grad; + return 0.5f * op::sqrt(pi) * op::exp(val * val) * grad; } template diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 548d2392a393..756c5e5efa01 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -269,6 +269,11 @@ __device__ inline DType identity(const DType val) { return val; } +template +__device__ inline DType negation(const DType val) { + return -val; +} + template __device__ inline DType add(const DType a, const DType2 b) { return a + b; @@ -294,30 +299,38 @@ __device__ inline DType rdiv(const DType a, const DType2 b) { return b / a; } -template -__device__ inline DType power(const DType a, const DType2 b) { - return powf(a, b); +#define DEFINE_BINARY_MATH_FUNC(name, double_version, float_version) \ +template \ +__device__ inline DType name (const DType a, const DType2 b) { \ + if (type_util::has_double_or_integral::value) { \ + return double_version ((double)a, (double)b); \ + } else { \ + return float_version (a, b); \ + } \ } +DEFINE_BINARY_MATH_FUNC(power, ::pow, ::powf) + template __device__ inline DType rpow(const DType a, const DType2 b) { - return powf(b, a); + return power(b, a); } template __device__ inline DType max(const DType a, const DType2 b) { + if (isnan(a)) return a; return a > b ? a : b; } template __device__ inline DType min(const DType a, const DType2 b) { + if (isnan(a)) return a; return a < b ? a : b; } -template -__device__ inline DType hypot(const DType a, const DType2 b) { - return hypotf(a, b); -} +DEFINE_BINARY_MATH_FUNC(hypot, ::hypot, ::hypotf) + +#undef DEFINE_BINARY_MATH_FUNC template __device__ inline typename LoadType::Type cast(const DType val) { @@ -328,55 +341,54 @@ __device__ inline typename LoadType::Type cast(const DType val) { template __device__ inline DType relu(const DType val) { - return val > 0 ? val : 0; + return (isnan(val) || val > 0) ? 
val : 0; } template __device__ inline DType sigmoid(const DType val) { - return 1.f/(1 + expf(-val)); + if (type_util::has_double_or_integral::value) { + return 1./(1 + ::exp(-val)); + } else { + return 1.f/(1 + expf(-val)); + } } template __device__ inline DType softrelu(const DType val) { - return logf(1 + expf(val)); + if (type_util::has_double_or_integral::value) { + return ::log(1 + ::exp(val)); + } else { + return logf(1 + expf(val)); + } } template __device__ inline DType softsign(const DType val) { - return val / (1 + fabsf(val)); + if (type_util::has_double_or_integral::value) { + return val / (1 + fabs(val)); + } else { + return val / (1 + fabsf(val)); + } } // exp and log -template -__device__ inline DType exp(const DType val) { - return expf(val); -} - -template -__device__ inline DType expm1(const DType val) { - return expm1f(val); -} - -template -__device__ inline DType log(const DType val) { - return logf(val); +#define DEFINE_UNARY_MATH_FUNC(name, double_version, float_version) \ +template \ +__device__ inline DType name (const DType a) { \ + if (type_util::has_double_or_integral::value) { \ + return double_version ((double)a); \ + } else { \ + return float_version (a); \ + } \ } -template -__device__ inline DType log10(const DType val) { - return log10f(val); -} - -template -__device__ inline DType log2(const DType val) { - return log2f(val); -} - -template -__device__ inline DType log1p(const DType val) { - return log1pf(val); -} +DEFINE_UNARY_MATH_FUNC(exp, ::exp, ::expf) +DEFINE_UNARY_MATH_FUNC(expm1, ::expm1, ::expm1f) +DEFINE_UNARY_MATH_FUNC(log, ::log, ::logf) +DEFINE_UNARY_MATH_FUNC(log10, ::log10, ::log10f) +DEFINE_UNARY_MATH_FUNC(log2, ::log2, ::log2f) +DEFINE_UNARY_MATH_FUNC(log1p, ::log1p, ::log1pf) // trigonometric @@ -384,95 +396,42 @@ constexpr double pi = 3.14159265358979323846; template __device__ inline DType degrees(const DType val) { - return (val / pi) * 180; + if (type_util::has_double_or_integral::value) { + return (val / pi) * 180; + } else { + return (val / static_cast(pi)) * 180.f; + } } template __device__ inline DType radians(const DType val) { - return (val / 180.0) * pi; -} - -template -__device__ inline DType sin(const DType val) { - return sinf(val); -} - -template -__device__ inline DType cos(const DType val) { - return cosf(val); -} - -template -__device__ inline DType tan(const DType val) { - return tanf(val); -} - -template -__device__ inline DType arcsin(const DType val) { - return asinf(val); -} - -template -__device__ inline DType arccos(const DType val) { - return acosf(val); -} - -template -__device__ inline DType arctan(const DType val) { - return atanf(val); -} - -template -__device__ inline DType sinh(const DType val) { - return sinhf(val); -} - -template -__device__ inline DType cosh(const DType val) { - return coshf(val); -} - -template -__device__ inline DType tanh(const DType val) { - return tanhf(val); -} - -template -__device__ inline DType arcsinh(const DType val) { - return asinhf(val); + if (type_util::has_double_or_integral::value) { + return (val / 180.0) * pi; + } else { + return (val / 180.0f) * static_cast(pi); + } } -template -__device__ inline DType arccosh(const DType val) { - return acoshf(val); -} +DEFINE_UNARY_MATH_FUNC(sin, ::sin, ::sinf) +DEFINE_UNARY_MATH_FUNC(cos, ::cos, ::cosf) +DEFINE_UNARY_MATH_FUNC(tan, ::tan, ::tanf) +DEFINE_UNARY_MATH_FUNC(arcsin, ::asin, ::asinf) +DEFINE_UNARY_MATH_FUNC(arccos, ::acos, ::acosf) +DEFINE_UNARY_MATH_FUNC(arctan, ::atan, ::atanf) -template -__device__ inline DType 
arctanh(const DType val) { - return atanhf(val); -} +DEFINE_UNARY_MATH_FUNC(sinh, ::sinh, ::sinhf) +DEFINE_UNARY_MATH_FUNC(cosh, ::cosh, ::coshf) +DEFINE_UNARY_MATH_FUNC(tanh, ::tanh, ::tanhf) +DEFINE_UNARY_MATH_FUNC(arcsinh, ::asinh, ::asinhf) +DEFINE_UNARY_MATH_FUNC(arccosh, ::acosh, ::acoshf) +DEFINE_UNARY_MATH_FUNC(arctanh, ::atanh, ::atanhf) // sqrt -template -__device__ inline DType sqrt(const DType val) { - return sqrtf(val); -} - -template -__device__ inline DType rsqrt(const DType val) { - return rsqrtf(val); -} - -template -__device__ inline DType cbrt(const DType val) { - return cbrtf(val); -} - -template -__device__ inline DType rcbrt(const DType val) { - return rcbrtf(val); -} +DEFINE_UNARY_MATH_FUNC(sqrt, ::sqrt, ::sqrtf) +DEFINE_UNARY_MATH_FUNC(rsqrt, ::rsqrt, ::rsqrtf) +DEFINE_UNARY_MATH_FUNC(cbrt, ::cbrt, ::cbrtf) +DEFINE_UNARY_MATH_FUNC(rcbrt, ::rcbrt, ::rcbrtf) template __device__ inline DType square(const DType val) { @@ -501,34 +460,64 @@ __device__ inline typename LoadType::Type one() { template __device__ inline DType round(const DType val) { - return roundf(val); + if (type_util::has_double::value) { + return ::round((double)val); + } else if (type_util::is_integral::value) { + return val; + } else { + return ::roundf(val); + } } template -__device__ inline DType rint(const DType val) { - return rintf(val); +__device__ inline DType floor(const DType val) { + if (type_util::has_double::value) { + return ::floor((double)val); + } else if (type_util::is_integral::value) { + return val; + } else { + return ::floorf(val); + } } template -__device__ inline DType fix(const DType val) { - const auto floor = floorf(val); - const auto ceil = ceilf(val); - return (floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? floor : ceil; +__device__ inline DType ceil(const DType val) { + if (type_util::has_double::value) { + return ::ceil((double)val); + } else if (type_util::is_integral::value) { + return val; + } else { + return ::ceilf(val); + } } template -__device__ inline DType floor(const DType val) { - return floorf(val); +__device__ inline DType rint(const DType val) { + if (type_util::has_double::value) { + return ::rint((double)val); + } else if (type_util::is_integral::value) { + return val; + } else { + return ::rintf(val); + } } template -__device__ inline DType ceil(const DType val) { - return ceilf(val); +__device__ inline DType fix(const DType val) { + const auto f = floor(val); + const auto c = ceil(val); + return (f > 0 ? f : -f) < (c > 0 ? c : -c) ? 
f : c; } template __device__ inline DType trunc(const DType val) { - return truncf(val); + if (type_util::has_double::value) { + return ::trunc((double)val); + } else if (type_util::is_integral::value) { + return val; + } else { + return ::truncf(val); + } } template @@ -547,30 +536,11 @@ __device__ inline DType reciprocal(const DType val) { return 1.0f / val; } -template -__device__ inline DType abs(const DType val) { - return fabsf(val); -} - -template -__device__ inline DType gamma(const DType val) { - return tgammaf(val); -} - -template -__device__ inline DType gammaln(const DType val) { - return lgammaf(val); -} - -template -__device__ inline DType erf(const DType val) { - return erff(val); -} - -template -__device__ inline DType erfinv(const DType val) { - return erfinvf(val); -} +DEFINE_UNARY_MATH_FUNC(abs, ::fabs, ::fabsf) +DEFINE_UNARY_MATH_FUNC(gamma, ::tgamma, ::tgammaf) +DEFINE_UNARY_MATH_FUNC(gammaln, ::lgamma, ::lgammaf) +DEFINE_UNARY_MATH_FUNC(erf, ::erf, ::erff) +DEFINE_UNARY_MATH_FUNC(erfinv, ::erfinv, ::erfinvf) template __device__ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { @@ -585,6 +555,18 @@ __device__ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { } } +template +__device__ inline DType logical_not(const DType val) { + return val != DType(0) ? DType(0) : DType(1); +} + +template +__device__ inline bool np_logical_not(const DType val) { + return !static_cast(val); +} + +#undef DEFINE_UNARY_MATH_FUNC + } // namespace op )code"; diff --git a/src/common/cuda/rtc/half-inl.h b/src/common/cuda/rtc/half-inl.h index a467a5efdfa1..b3306b295637 100644 --- a/src/common/cuda/rtc/half-inl.h +++ b/src/common/cuda/rtc/half-inl.h @@ -45,6 +45,34 @@ __device__ inline float __half2float(const __half h) { } typedef __half half; + +template +struct AccType { + using type = DType; + + __device__ static inline type from(const DType& val) { + return val; + } + + __device__ static inline DType to(type val) { + return val; + } + +}; + +template<> +struct AccType { + using type = float; + + __device__ static inline type from(const half& val) { + return __half2float(val); + } + + __device__ static inline half to(type val) { + return __float2half(val); + } +}; + )code"; } // namespace rtc diff --git a/src/common/cuda/rtc/type-inl.h b/src/common/cuda/rtc/type-inl.h index 927a8f43443a..9165926ac1e8 100644 --- a/src/common/cuda/rtc/type-inl.h +++ b/src/common/cuda/rtc/type-inl.h @@ -20,6 +20,8 @@ #ifndef MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ #define MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ +#include + #if MXNET_USE_CUDA namespace mxnet { @@ -35,8 +37,66 @@ using uint8 = unsigned char; using int8 = char; using int32 = int; using int64 = long long; -)code"; +namespace type_util { + +struct false_type { + static constexpr bool value = false; +}; + +struct true_type { + static constexpr bool value = true; +}; + +// is_integral +template struct is_integral : false_type {}; +template <> struct is_integral : true_type {}; +template <> struct is_integral : true_type {}; +template <> struct is_integral : true_type {}; +template <> struct is_integral : true_type {}; +template <> struct is_integral : true_type {}; + +// is_same +template +struct is_same : false_type {}; +template struct is_same : true_type {}; + +// has_double +template struct has_double : false_type {}; + +template +struct has_double { + static constexpr bool value = is_same::value || + has_double::value; +}; + +// has_double_or_integral +template struct has_double_or_integral : false_type {}; + +template +struct 
has_double_or_integral { + static constexpr bool value = is_same::value || + is_integral::value || + has_double_or_integral::value; +}; + + +} // namespace type_util +)code" +#if MSHADOW_INT64_TENSOR_SIZE == 1 +"typedef int64 index_t;\n"; +#else +"typedef int32 index_t;\n"; +#endif + +const char op_req_type_string[] = R"code( +enum class OpReqType { + kNullOp, + kWriteTo, + kWriteInplace, + kAddTo +}; +)code"; } // namespace rtc } // namespace cuda } // namespace common diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h new file mode 100644 index 000000000000..c3bc6881de3d --- /dev/null +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -0,0 +1,448 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_COMMON_CUDA_RTC_VECTORIZATION_INL_H_ +#define MXNET_COMMON_CUDA_RTC_VECTORIZATION_INL_H_ + +#include + +#if MXNET_USE_CUDA + +#include +#include +#include +#include + +#include "../rtc.h" + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const char vectorization_support_string[] = R"code( + +namespace vector { + +template +struct VectorType { + static_assert(size <= 32, "VectorType needs to have size of at most 32B"); +}; + +template <> +struct VectorType<1> { + using type = char1; +}; + +template <> +struct VectorType<2> { + using type = short1; +}; + + +template <> +struct VectorType<4> { + using type = uint1; +}; + +template <> +struct VectorType<8> { + using type = ulong1; +}; + +template <> +struct VectorType<16> { + using type = ulong2; +}; + +template <> +struct VectorType<32> { + using type = ulong4; +}; + +/* \brief Helper class that enables storing multiple values of type DType + as 1 value of type LType. +*/ +template +class VectorizedStorage { + public: + using LType = typename VectorType::type; + constexpr static int nvec = n; + union vectorized_storage { + LType aligned; + DType separate[nvec]; // NOLINT(*) + + inline __device__ vectorized_storage() {} + inline __device__ ~vectorized_storage() {} + } scratch_; +}; + +// Returns const LType is DType is const +template +struct select_const { + using type = LType; +}; + +template +struct select_const { + using type = const LType; +}; + +template +struct remove_const { + using type = DType; +}; + +template +struct remove_const { + using type = DType; +}; + + +/* \brief Helper class that enables accessing multiple values of type DType + as 1 value of type LType. Additional aligned template argument + allows performance optimizations if the pointer and the size of + the allocation is aligned to sizeof(LType) / sizeof(DType) elements. 
+*/ +template +class VectorizedAccessor { + public: + using StorageType = VectorizedStorage::type, + nvec>; + using LType = typename select_const::type; + StorageType storage_; + + LType* aligned_ptr_; + DType* unaligned_ptr_; + int alignment_; + index_t n_elems_; + + inline __device__ VectorizedAccessor(DType* const ptr, const index_t size) { + unaligned_ptr_ = ptr; + if (aligned) { + alignment_ = 0; + aligned_ptr_ = reinterpret_cast(ptr); + n_elems_ = (size + nvec- 1) / nvec; + } else { + size_t ptr_as_number = reinterpret_cast(ptr); + alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); + aligned_ptr_ = reinterpret_cast(ptr - alignment_); + n_elems_ = (size + alignment_ + nvec - 1) / nvec; + } + } + + /* \brief Alignment of the input pointer in elements. */ + inline __device__ int alignment() const { + return alignment_; + } + + /* \brief Access to separate elements. */ + inline __device__ DType* separate() { + return storage_.scratch_.separate; + } + + /* \brief Number of aligned elements that span the entire input tensor. */ + inline __device__ index_t num_aligned_elements() const { + return n_elems_; + } + + /* \brief Load values from the input. + \param id Aligned index of the element. + \param N size of the tensor. + */ + inline __device__ void load(const index_t id, const index_t N) { + if (aligned) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { + if (id > 0 && id < n_elems_ - 1) { + storage_.scratch_.aligned = aligned_ptr_[id]; + } else { +#pragma unroll + for (int j = 0; j < nvec; ++j) { + DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { + storage_.scratch_.separate[j] = *ptr; + } + } + } + } + } +}; + +/* \brief Class used for vectorized read-only access. */ +template +class VectorizedLoader : public VectorizedAccessor { + public: + inline __device__ VectorizedLoader(const DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } +}; + +/* \brief Class used for vectorized writable access. */ +template +class VectorizedStorer : public VectorizedAccessor { + public: + inline __device__ VectorizedStorer(DType* ptr, const index_t N) : + VectorizedAccessor(ptr, N) { + } + + /* \brief Store values to the output. + \param id Aligned index of the element. + \param N size of the tensor. 
+ */ + inline __device__ void store(const index_t id, const index_t N) { + if (aligned) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { + if (id > 0 && id < this->n_elems_ - 1) { + this->aligned_ptr_[id] = this->storage_.scratch_.aligned; + } else { +#pragma unroll + for (int j = 0; j < nvec; ++j) { + DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; + if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && + reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { + *ptr = this->storage_.scratch_.separate[j]; + } + } + } + } + } +}; + +} // namespace vector + +)code"; + +namespace { + +index_t get_num_aligned_elements(const void *ptr, const index_t lead_dim, + const int nvec, const int size) { + size_t ptr_as_number = reinterpret_cast(ptr); + int alignment = (ptr_as_number % (nvec * size)) / size; + return (lead_dim + alignment + nvec - 1) / nvec; +} + +struct TypeInfo { + std::string name; + int size; + + TypeInfo(const std::string name, const int size) : + name(std::move(name)), size(size) {} +}; + +TypeInfo mshadow_type_info(int type_flag) { + using namespace mshadow; + switch (type_flag) { + case kFloat32: + return TypeInfo("float32", sizeof(float)); + case kFloat64: + return TypeInfo("float64", sizeof(double)); + case kFloat16: + return TypeInfo("float16", 2); + case kUint8: + return TypeInfo("uint8", sizeof(uint8_t)); + case kInt32: + return TypeInfo("int32", sizeof(int32_t)); + case kInt8: + return TypeInfo("int8", sizeof(int8_t)); + case kInt64: + return TypeInfo("int64", sizeof(int64_t)); + case kBool: + return TypeInfo("bool", sizeof(bool)); + default: + LOG(FATAL) << "Unknown type flag " << type_flag; + return TypeInfo("INVALID", 1); + } +} + +enum class Alignment { + SAME_ALIGNED, // All tensors aligned + SAME_UNALIGNED, // All tensors have the same misalignment + DIFFERENT // Tensors have different alignment +}; + +int CalcAlignment(const void *ptr, const int size) { + size_t ptr_as_number = reinterpret_cast(ptr); + return ptr_as_number % size; +} + +/* \brief Check alignment of the inputs and outputs when cast to LType*. + \param params Structure containing arrays with inputs' and outputs' pointers + \param lead_dim Leading dimension of the tensors. + \param other_dim The size of the other dimensions of the tensors. 
+*/ +template +Alignment CheckAlignment(const Params& params, const index_t lead_dim, + const index_t other_dim, const int nvec, + const std::vector &inputs, + const std::vector &outputs) { + int align = -1; + + size_t i = 0; + for (const void *ptr : params.inputs) { + int new_align = CalcAlignment(ptr, + mshadow_type_info(inputs[i].type_flag_).size * nvec); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + ++i; + } + + i = 0; + for (const void *ptr : params.outputs) { + int new_align = CalcAlignment(ptr, + mshadow_type_info(outputs[i].type_flag_).size * nvec); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } + } + ++i; + } + + if ((other_dim != 1) && + (lead_dim % nvec != 0)) { + return Alignment::DIFFERENT; + } + + if ((align == 0) && + (lead_dim % nvec == 0)) { + return Alignment::SAME_ALIGNED; + } else { + return Alignment::SAME_UNALIGNED; + } +} + +constexpr int vectorized_kernel_thread_num = 512; + +} // namespace + +template +void VectorizedKernelRTCLauncher(const std::string &code, + const std::string &kernel_name, + const int nvec, + const index_t lead_dim, + const index_t other_dim, + mshadow::Stream *s, + const Params params, + const std::vector &inputs, + const std::vector &outputs, + const int dev_id) { + const index_t N = lead_dim * other_dim; + if (N != 0) { + index_t num_aligned_elements = get_num_aligned_elements(params.inputs[0], lead_dim, nvec, + mshadow_type_info(inputs[0].type_flag_) + .size); + size_t num_elements = other_dim * num_aligned_elements; + constexpr int threads = vectorized_kernel_thread_num; + constexpr int max_blocks = 65535; + index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), + max_blocks); + auto align = CheckAlignment(params, lead_dim, other_dim, + nvec, inputs, outputs); + std::stringstream kernel_builder; + + // Fill input types + int counter = 0; + for (const auto& input : inputs) { + const auto& type_info = mshadow_type_info(input.type_flag_); + kernel_builder << "using InputType" + << counter + << " = " + << type_info.name + << ";" + << std::endl; + ++counter; + } + + // Fill output types + counter = 0; + for (const auto& output : outputs) { + const auto& type_info = mshadow_type_info(output.type_flag_); + kernel_builder << "using OutputType" + << counter + << " = " + << type_info.name + << ";" + << std::endl; + ++counter; + } + + switch (align) { + case Alignment::SAME_ALIGNED: + kernel_builder << "const bool aligned = true;" + << std::endl + << "const int nvec = " + << nvec + << ";" + << std::endl; + break; + case Alignment::SAME_UNALIGNED: + kernel_builder << "const bool aligned = false;" + << std::endl + << "const int nvec = " + << nvec + << ";" + << std::endl; + break; + case Alignment::DIFFERENT: { + num_aligned_elements = lead_dim * other_dim; + blocks = std::min(static_cast((num_aligned_elements + threads - 1) / threads), + max_blocks); + // If the pointers are aligned differently we cannot vectorize + kernel_builder << "const bool aligned = true;" + << std::endl + << "const int nvec = 1;" + << std::endl; + break; + } + } + + kernel_builder << code; + + std::vector args = {¶ms, &lead_dim, &other_dim, + &N, &num_aligned_elements}; + auto function = common::cuda::rtc::get_function(kernel_builder.str(), + kernel_name, + dev_id); + + common::cuda::rtc::launch(function, + {static_cast(blocks), 1, 1}, + {threads, 1, 1}, + 0, s, &args); + } +} + + +} // namespace rtc +} // 
namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_VECTORIZATION_INL_H_ diff --git a/src/common/cuda/vectorization.cuh b/src/common/cuda/vectorization.cuh index 56253d3535be..bc7840a7014a 100644 --- a/src/common/cuda/vectorization.cuh +++ b/src/common/cuda/vectorization.cuh @@ -26,7 +26,9 @@ #ifndef MXNET_COMMON_CUDA_VECTORIZATION_CUH_ #define MXNET_COMMON_CUDA_VECTORIZATION_CUH_ -#if MXNET_USE_CUDA && __CUDACC__ +#include + +#if MXNET_USE_CUDA #include #include "utils.h" @@ -278,6 +280,6 @@ void VectorizedKernelLauncher(const index_t lead_dim, } // namespace common } // namespace mxnet -#endif // MXNET_USE_CUDA && __CUDACC__ +#endif // MXNET_USE_CUDA #endif // MXNET_COMMON_CUDA_VECTORIZATION_CUH_ diff --git a/src/libinfo.cc b/src/libinfo.cc index dd535093f0a0..f7b8f80a08e5 100644 --- a/src/libinfo.cc +++ b/src/libinfo.cc @@ -133,7 +133,6 @@ const std::vector EnumNames::names = { "CUDA", "CUDNN", "NCCL", - "CUDA_RTC", "TENSORRT", "CPU_SSE", "CPU_SSE2", diff --git a/src/operator/contrib/stes_op.cu b/src/operator/contrib/stes_op.cu index 85e3ddaf206f..5ce947900899 100644 --- a/src/operator/contrib/stes_op.cu +++ b/src/operator/contrib/stes_op.cu @@ -31,13 +31,13 @@ namespace op { // Round STE NNVM_REGISTER_OP(_contrib_round_ste) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"round"}) +.set_attr("FComputeEx", UnaryRTCCompute{"round"}); // Sign STE NNVM_REGISTER_OP(_contrib_sign_ste) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"sign"}) +.set_attr("FComputeEx", UnaryRTCCompute{"sign"}); } // namespace op } // namespace mxnet diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 06add1a39540..ea2303e3aa1f 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -129,6 +129,7 @@ const std::map>> ops_desc = { {"negative" , {{"(-%)", "_0"}}}, {"_hypot" , {{"op::hypot(%, %)", "_0", "_1"}}}, {"_hypot_scalar" , {{"op::hypot(%, float(%))", "_0", "scalar"}}}, + {"logical_not" , {{"op::logical_not(%)", "_0"}}}, {"_backward_relu" , {{"op::backward_relu(%, %)", "_1", "_0"}}}, {"_backward_sigmoid" , {{"op::backward_sigmoid(%, %)", "_1", "_0"}}}, {"_backward_expm1" , {{"op::backward_expm1(%, %)", "_1", "_0"}}}, diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 1863b4aae8ee..23d6b231630f 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -567,7 +567,7 @@ std::string FusedOp::GenerateCode(const std::vector &req, CUfunction FusedOp::CompileCode(const std::string &code, const std::string &kernel_name, int dev_id) { - return common::cuda::rtc::get_function(code, kernel_name, dev_id); + return common::cuda::rtc::get_function(code, "FusedKernel_" + kernel_name, dev_id); } @@ -672,7 +672,7 @@ void FusedOp::Forward(const nnvm::NodeAttrs& attrs, << ", not expecting switch to device " << dev_id; Stream* s = ctx.get_stream(); - std::vector args; + std::vector args; size_t N = 0; for (const auto& output : outputs) { N = std::max(N, output.shape_.Size()); diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h index f4cb0600c80f..bb13309bc9cb 100644 --- a/src/operator/fusion/fused_op.h +++ b/src/operator/fusion/fused_op.h @@ -58,7 +58,6 @@ struct FusedOpEntry { class FusedOp { public: static const int NTHREADS = 512; - static const 
int CACHESIZE_WARN_THRESHOLD = 10000; explicit FusedOp(const nnvm::NodeAttrs* attrs, const FusedOpConfig& config); ~FusedOp() {} diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index bce6335a7384..025abc8233e8 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -27,108 +27,94 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_npx_relu) -.set_attr("FCompute", UnaryOp::Compute); +#define MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(__name$, __kernel$) \ + NNVM_REGISTER_OP(__name$) \ + .set_attr("FCompute", UnaryRTCCompute{#__kernel$}) -NNVM_REGISTER_OP(_npx_sigmoid) -.set_attr("FCompute", UnaryOp::Compute); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npx_relu, relu); + +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npx_sigmoid, sigmoid); NNVM_REGISTER_OP(_np_copy) .set_attr("FCompute", UnaryOp::IdentityCompute); -#define MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(__name$, __kernel$) \ - NNVM_REGISTER_OP(__name$) \ - .set_attr("FCompute", UnaryOp::Compute) - -#define MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(__name$, __kernel$) \ - NNVM_REGISTER_OP(__name$) \ - .set_attr("FCompute", UnaryOp::ComputeMixedType) - -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_negative, mshadow_op::negation); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_negative, negation); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_reciprocal, mshadow_op::reciprocal); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_reciprocal, reciprocal); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_absolute, mshadow_op::abs); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_absolute, abs); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sign, mshadow_op::sign); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sign, sign); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_rint, mshadow_op::rint); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_rint, rint); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_ceil, mshadow_op::ceil); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_ceil, ceil); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_floor, mshadow_op::floor); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_floor, floor); -NNVM_REGISTER_OP(_npi_bitwise_not) -.set_attr("FCompute", UnaryOp::ComputeInt); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_bitwise_not, bitwise_not); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_trunc, mshadow_op::trunc); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_trunc, trunc); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_fix, mshadow_op::fix); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_fix, fix); -MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_square, mshadow_op::square); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_square, square); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_sqrt, mshadow_op::square_root); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sqrt, sqrt); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_cbrt, mshadow_op::cube_root); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_cbrt, cbrt); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_exp, mshadow_op::exp); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_exp, exp); -NNVM_REGISTER_OP(_npi_log) -.set_attr("FCompute", UnaryOp::ComputeMixedType); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log, log); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_log10, mshadow_op::log10); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log10, log10); 
-MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_log2, mshadow_op::log2); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log2, log2); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_log1p, mshadow_op::log1p); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_log1p, log1p); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_expm1, mshadow_op::expm1); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_expm1, expm1); -NNVM_REGISTER_OP(_npi_logical_not) -.set_attr("FCompute", UnaryOp::ComputeLogic); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_logical_not, np_logical_not); -NNVM_REGISTER_OP(_npi_isnan) -.set_attr("FCompute", UnaryOp::ComputeLogic); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_isnan, isnan); -NNVM_REGISTER_OP(_npi_isinf) -.set_attr("FCompute", UnaryOp::ComputeLogic); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_isinf, isinf); -NNVM_REGISTER_OP(_npi_isposinf) -.set_attr("FCompute", UnaryOp::ComputeLogic); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_isposinf, isposinf); -NNVM_REGISTER_OP(_npi_isneginf) -.set_attr("FCompute", UnaryOp::ComputeLogic); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_isneginf, isneginf); -NNVM_REGISTER_OP(_npi_isfinite) -.set_attr("FCompute", UnaryOp::ComputeLogic); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_isfinite, isfinite); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_sin, mshadow_op::sin); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sin, sin); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_cos, mshadow_op::cos); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_cos, cos); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_tan, mshadow_op::tan); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_tan, tan); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_arcsin, mshadow_op::arcsin); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arcsin, arcsin); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_arccos, mshadow_op::arccos); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arccos, arccos); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_arctan, mshadow_op::arctan); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arctan, arctan); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_degrees, mshadow_op::degrees); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_degrees, degrees); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_radians, mshadow_op::radians); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_radians, radians); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_sinh, mshadow_op::sinh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_sinh, sinh); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_cosh, mshadow_op::cosh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_cosh, cosh); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_tanh, mshadow_op::tanh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_tanh, tanh); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_arcsinh, mshadow_op::arcsinh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arcsinh, arcsinh); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_arccosh, mshadow_op::arccosh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arccosh, arccosh); -MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY_GPU(_npi_arctanh, mshadow_op::arctanh); +MXNET_OPERATOR_REGISTER_NUMPY_UNARY_GPU(_npi_arctanh, arctanh); NNVM_REGISTER_OP(_npi_around) .set_attr("FCompute", AroundOpForward); diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index c42590fd0848..e8fec3081c0a 
100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -30,7 +30,7 @@ #include #include "../mxnet_op.h" #include "../operator_common.h" -#include "../../src/operator/tensor/init_op.h" +#include "./init_op.h" #ifdef __CUDACC__ #include "./cast_storage-inl.cuh" #endif // __CUDACC__ diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index da611c82a537..891cba0c129c 100644 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -29,6 +29,8 @@ #include #include "../operator_common.h" #include "../../common/cuda/vectorization.cuh" +#include "../../common/cuda/rtc/vectorization-inl.h" +#include "../../common/cuda/rtc.h" #include @@ -37,89 +39,124 @@ namespace mxnet { namespace op { -namespace unary { +struct unary_kernel_params { + const void *inputs[1]; + void *outputs[1]; +}; -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; +const char unary_kernel_fwd[] = R"code( -template -struct VectorizedKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; +struct unary_kernel_params { + const void *inputs[1]; + void *outputs[1]; }; -template -__global__ void VectorizedUnaryScalarKernelFwd(const VectorizedKernelParams params, - const index_t N) { - VectorizedLoader loader(params.inputs[0], N); - VectorizedStorer storer(params.outputs[0], N); +__global__ void unary_kernel(const unary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType = AccType; + using OType = AccType; - const index_t M = loader.num_aligned_elements(); + const index_t M = num_aligned_elements; for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < M; tid += gridDim.x * blockDim.x) { loader.load(tid, N); - if (req == kAddTo) { + if (req == OpReqType::kAddTo) { storer.load(tid, N); } #pragma unroll - for (int i = 0; i < loader.nvec(); ++i) { - DType temp = OP::Map(loader.separate()[i]); - - if (req == kAddTo) { - storer.separate()[i] += temp; + for (int i = 0; i < nvec; ++i) { + const auto input = IType::from(loader.separate()[i]); + const auto temp = OP(input); // enables returning different type + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); } else { - storer.separate()[i] = temp; + storer.separate()[i] = OType::to(temp); } } storer.store(tid, N); } } -template -class VectorizedUnaryScalarFwd { - public: - using ParamType = VectorizedKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedUnaryScalarKernelFwd - <<>>(params, lead_dim); - } -}; +)code"; + +struct UnaryRTCCompute { -} // namespace unary + std::string OP; -template -void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, - mshadow::Stream* s, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace unary; +void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const 
std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedUnaryScalarFwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.outputs[0] = outputs[0].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + unary_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + + const index_t size = outputs[0].Size(); + unary_kernel_params params = { {inputs[0].dptr_}, + {outputs[0].dptr_} }; + + VectorizedKernelRTCLauncher(code, "unary_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); } +void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + InitStorageGeometry<1, 1>(attrs, inputs, outputs); + CHECK_NE(outputs[0].storage_type(), kDefaultStorage) + << "This function works only for sparse types."; + CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()) + << "The storage type of both inputs and outputs needs to be the same."; + AllocateGeometry(&outputs[0], req[0], &inputs[0]); + CopyGeometryBlobs(ctx.get_stream(), &outputs[0], req[0], inputs[0]); + outputs[0].CheckAndAllocData(inputs[0].storage_shape()); + if (inputs[0].storage_shape().Size()) { + std::vector in_blobs, out_blobs; + in_blobs.reserve(inputs.size()); + out_blobs.reserve(outputs.size()); + for (auto &input : inputs) { + in_blobs.emplace_back(input.data()); + } + for (auto &output : outputs) { + out_blobs.emplace_back(output.data()); + } + this->operator()(attrs, ctx, in_blobs, req, out_blobs); + } +} + +}; + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 8067491ded2a..d9634c252eb1 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -45,69 +45,117 @@ namespace mxnet { namespace op { -class OpBase { - protected: - /*! \brief simple kernel to set to a scalar value of arbitrary type */ - template - using set_to_scalar = mxnet_op::op_with_req; +namespace { - /*! \brief Copy blob data */ - template - static void inline CopyBlob(mshadow::Stream *s, - const TBlob *dest_blob, - const OpReqType reqi, - const TBlob& src_blob) { - CHECK_EQ(src_blob.type_flag_, dest_blob->type_flag_); - CHECK_EQ(src_blob.shape_, dest_blob->shape_); - MSHADOW_TYPE_SWITCH(src_blob.type_flag_, DType, { - // Check if the pointers are the same (in-place operation needs no copy) - if (reqi != kNullOp && src_blob.dptr() != dest_blob->dptr()) { - mshadow::Copy(dest_blob->FlatTo1D(s), src_blob.FlatTo1D(s), s); +/*! 
\brief Infer the output storage geometry + * \return boolean signifying whether the proper storage geometry was initialized + */ +template +bool InitStorageGeometry(const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), static_cast(n_in)) + << " in operator " << attrs.name; + CHECK_EQ(outputs.size(), static_cast(n_out)) + << " in operator " << attrs.name; + static_assert(n_in > 0 && n_out > 0, "Invalid input and/or output count values"); + const mxnet::TShape& isshape = inputs[0].storage_shape(); + if (!shape_is_none(isshape)) { + NDArray *output = nullptr; + for (size_t i = 0, n = inputs.size(); i < n; ++i) { + const NDArray &input = inputs[i]; + if (i < n_out) { + output = const_cast(&outputs[i]); } - }); + CHECK_EQ(output->shape(), inputs[i].shape()); + CHECK_EQ(output->storage_type(), input.storage_type()); + CHECK_EQ(output->aux_shapes().size(), input.aux_shapes().size()); + mxnet::ShapeVector aux_shapes; + const size_t aux_shape_count = input.aux_shapes().size(); + aux_shapes.reserve(aux_shape_count); + for (size_t j = 0; j < aux_shape_count; ++j) { + aux_shapes.emplace_back(input.aux_shape(j)); + } + output->CheckAndAlloc(aux_shapes); + DCHECK_EQ(output->storage_shape(), input.storage_shape()); + } + return true; + } + if (isshape.ndim() > 0 && !isshape.Size() + && inputs[0].storage_type() != kDefaultStorage) { + return true; // 0% density + } else { + CHECK(false); // implement when necessary } + return false; +} - /*! \brief Allocate geometry-related blob data for sparse tensors - * \param dest Destination sparse NDArray - * \param clone_from sparse NDArray from which to clone storage attributes - */ - static void AllocateGeometry(const NDArray *dest, - const OpReqType req, - const NDArray* clone_from = nullptr) { - if (req != kNullOp) { - if (clone_from) { - const mxnet::TShape& ishape = clone_from->storage_shape(); - dest->CheckAndAllocData(ishape); - CHECK_EQ(dest->storage_type(), clone_from->storage_type()); - for (size_t i = 0, n = clone_from->aux_shapes().size(); i < n; ++i) { - dest->CheckAndAllocAuxData(i, clone_from->aux_shape(i)); - } - DCHECK_EQ(dest->aux_shapes().size(), clone_from->aux_shapes().size()); - } else { - for (size_t i = 0, n = dest->aux_shapes().size(); i < n; ++i) { - dest->CheckAndAllocAuxData(i, dest->aux_shape(i)); - } - dest->CheckAndAllocData(dest->storage_shape()); +/*! \brief Copy blob data */ +template +void inline CopyBlob(mshadow::Stream *s, + const TBlob *dest_blob, + const OpReqType reqi, + const TBlob& src_blob) { + CHECK_EQ(src_blob.type_flag_, dest_blob->type_flag_); + CHECK_EQ(src_blob.shape_, dest_blob->shape_); + MSHADOW_TYPE_SWITCH(src_blob.type_flag_, DType, { + // Check if the pointers are the same (in-place operation needs no copy) + if (reqi != kNullOp && src_blob.dptr() != dest_blob->dptr()) { + mshadow::Copy(dest_blob->FlatTo1D(s), src_blob.FlatTo1D(s), s); + } + }); +} + +/*! 
\brief Allocate geometry-related blob data for sparse tensors + * \param dest Destination sparse NDArray + * \param clone_from sparse NDArray from which to clone storage attributes + */ +void AllocateGeometry(const NDArray *dest, + const OpReqType req, + const NDArray* clone_from = nullptr) { + if (req != kNullOp) { + if (clone_from) { + const mxnet::TShape& ishape = clone_from->storage_shape(); + dest->CheckAndAllocData(ishape); + CHECK_EQ(dest->storage_type(), clone_from->storage_type()); + for (size_t i = 0, n = clone_from->aux_shapes().size(); i < n; ++i) { + dest->CheckAndAllocAuxData(i, clone_from->aux_shape(i)); } + DCHECK_EQ(dest->aux_shapes().size(), clone_from->aux_shapes().size()); + } else { + for (size_t i = 0, n = dest->aux_shapes().size(); i < n; ++i) { + dest->CheckAndAllocAuxData(i, dest->aux_shape(i)); + } + dest->CheckAndAllocData(dest->storage_shape()); } } +} - /*! \brief Copy the geometry-related blobs (row sparse indexes, etc.) */ - template - static inline void CopyGeometryBlobs(mshadow::Stream *s, - const NDArray *dest, - const OpReqType reqi, - const NDArray &src) { - CHECK_EQ(src.aux_shapes().size(), dest->aux_shapes().size()); - // My assumption is that the geometry blobs are not large enough to justify an omp loop here, - // since the thread synchronization calls for each fork will take longer - // than copying a few floats - for (size_t i = 0, n = src.aux_shapes().size(); i < n; ++i) { - const TBlob src_blob = src.aux_data(i); - const TBlob dest_blob = dest->aux_data(i); - CopyBlob(s, &dest_blob, reqi, src_blob); - } +/*! \brief Copy the geometry-related blobs (row sparse indexes, etc.) */ +template +inline void CopyGeometryBlobs(mshadow::Stream *s, + const NDArray *dest, + const OpReqType reqi, + const NDArray &src) { + CHECK_EQ(src.aux_shapes().size(), dest->aux_shapes().size()); + // My assumption is that the geometry blobs are not large enough to justify an omp loop here, + // since the thread synchronization calls for each fork will take longer + // than copying a few floats + for (size_t i = 0, n = src.aux_shapes().size(); i < n; ++i) { + const TBlob src_blob = src.aux_data(i); + const TBlob dest_blob = dest->aux_data(i); + CopyBlob(s, &dest_blob, reqi, src_blob); } +} + +} // namespace + +class OpBase { + protected: + /*! \brief simple kernel to set to a scalar value of arbitrary type */ + template + using set_to_scalar = mxnet_op::op_with_req; + /*! \brief Generic copy NDArray */ template @@ -172,49 +220,6 @@ class OpBase { /*! \brief Unary operator class */ class UnaryOp : public OpBase { - /*! 
\brief Infer the output storage geometry - * \return boolean signifying whether the proper storage geometry was initialized - */ - template - static bool InitStorageGeometry(const nnvm::NodeAttrs& attrs, - const std::vector& inputs, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), static_cast(n_in)) - << " in operator " << attrs.name; - CHECK_EQ(outputs.size(), static_cast(n_out)) - << " in operator " << attrs.name; - static_assert(n_in > 0 && n_out > 0, "Invalid input and/or output count values"); - const mxnet::TShape& isshape = inputs[0].storage_shape(); - if (!shape_is_none(isshape)) { - NDArray *output = nullptr; - for (size_t i = 0, n = inputs.size(); i < n; ++i) { - const NDArray &input = inputs[i]; - if (i < n_out) { - output = const_cast(&outputs[i]); - } - CHECK_EQ(output->shape(), inputs[i].shape()); - CHECK_EQ(output->storage_type(), input.storage_type()); - CHECK_EQ(output->aux_shapes().size(), input.aux_shapes().size()); - mxnet::ShapeVector aux_shapes; - const size_t aux_shape_count = input.aux_shapes().size(); - aux_shapes.reserve(aux_shape_count); - for (size_t j = 0; j < aux_shape_count; ++j) { - aux_shapes.emplace_back(input.aux_shape(j)); - } - output->CheckAndAlloc(aux_shapes); - DCHECK_EQ(output->storage_shape(), input.storage_shape()); - } - return true; - } - if (isshape.ndim() > 0 && !isshape.Size() - && inputs[0].storage_type() != kDefaultStorage) { - return true; // 0% density - } else { - CHECK(false); // implement when necessary - } - return false; - } - public: /*! \brief Map NDArray vectors to TBlob vectors and pass to compute function */ template @@ -224,7 +229,7 @@ class UnaryOp : public OpBase { const std::vector &req, const std::vector &outputs, FComputer computer) { - UnaryOp::template InitStorageGeometry<1, 1>(attrs, inputs, outputs); + InitStorageGeometry<1, 1>(attrs, inputs, outputs); CHECK_EQ(inputs.size(), outputs.size()); // need to figure out what to do for binary type CHECK_NE(outputs[0].storage_type(), kDefaultStorage); CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()); @@ -252,16 +257,6 @@ class UnaryOp : public OpBase { }); } -#if MXNET_USE_CUDA - template - static void Compute_(const nnvm::NodeAttrs& attrs, - mshadow::Stream* s, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -#endif - template static void Compute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -338,7 +333,8 @@ class UnaryOp : public OpBase { const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); - CHECK_NE(inputs[0].storage_type(), kDefaultStorage); + CHECK_NE(inputs[0].storage_type(), kDefaultStorage) + << "Operation requires a sparse input storage type"; CHECK_NE(outputs[0].storage_type(), kDefaultStorage) << "Operation requires a sparse output storage type"; if (inputs[0].storage_shape().Size()) { @@ -379,7 +375,7 @@ class UnaryOp : public OpBase { CHECK_EQ(outputs.size(), 1U) << "Invalid output, only one output is allowed"; CHECK_NE(inputs[0].storage_type(), kDefaultStorage) - << "Operation requires a sparse output storage type"; + << "Operation requires a sparse input storage type"; CHECK_NE(outputs[0].storage_type(), kDefaultStorage) << "Operation requires a sparse output storage type"; if (inputs[0].storage_shape().Size()) { diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 6bbba7411851..b7c487d8b770 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ 
b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -27,15 +27,15 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(relu) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"relu"}) +.set_attr("FComputeEx", UnaryRTCCompute{"relu"}); NNVM_REGISTER_OP(_backward_relu) .set_attr("FCompute", ElemwiseBinaryOp::Compute< gpu, unary_bwd>); NNVM_REGISTER_OP(sigmoid) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"sigmoid"}); NNVM_REGISTER_OP(_backward_sigmoid) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -49,7 +49,7 @@ NNVM_REGISTER_OP(_backward_hard_sigmoid) // softsign NNVM_REGISTER_OP(softsign) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"softsign"}); NNVM_REGISTER_OP(_backward_softsign) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -57,7 +57,7 @@ NNVM_REGISTER_OP(_backward_softsign) // erf NNVM_REGISTER_OP(erf) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"erf"}); NNVM_REGISTER_OP(_backward_erf) .set_attr("FCompute", @@ -65,7 +65,7 @@ NNVM_REGISTER_OP(_backward_erf) // erfinv NNVM_REGISTER_OP(erfinv) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"erfinv"}); NNVM_REGISTER_OP(_backward_erfinv) .set_attr("FCompute", @@ -152,21 +152,21 @@ NNVM_REGISTER_OP(_backward_cast) // negative NNVM_REGISTER_OP(negative) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"negation"}) +.set_attr("FComputeEx", UnaryRTCCompute{"negation"}); // abs NNVM_REGISTER_OP(abs) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"abs"}) +.set_attr("FComputeEx", UnaryRTCCompute{"abs"}); NNVM_REGISTER_OP(_backward_abs) .set_attr("FCompute", ElemwiseBinaryOp::Compute >); // sign NNVM_REGISTER_OP(sign) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"sign"}) +.set_attr("FComputeEx", UnaryRTCCompute{"sign"}); NNVM_REGISTER_OP(_backward_sign) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -174,37 +174,37 @@ NNVM_REGISTER_OP(_backward_sign) // round NNVM_REGISTER_OP(round) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"round"}) +.set_attr("FComputeEx", UnaryRTCCompute{"round"}); // ceil NNVM_REGISTER_OP(ceil) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"ceil"}) +.set_attr("FComputeEx", UnaryRTCCompute{"ceil"}); // floor NNVM_REGISTER_OP(floor) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"floor"}) +.set_attr("FComputeEx", UnaryRTCCompute{"floor"}); // trunc NNVM_REGISTER_OP(trunc) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"trunc"}) +.set_attr("FComputeEx", UnaryRTCCompute{"trunc"}); // rint NNVM_REGISTER_OP(rint) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"rint"}) +.set_attr("FComputeEx", UnaryRTCCompute{"rint"}); // fix NNVM_REGISTER_OP(fix) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", 
UnaryRTCCompute{"fix"}) +.set_attr("FComputeEx", UnaryRTCCompute{"fix"}); // gamma NNVM_REGISTER_OP(gamma) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"gamma"}); NNVM_REGISTER_OP(_backward_gamma) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -212,7 +212,7 @@ NNVM_REGISTER_OP(_backward_gamma) // gammaln NNVM_REGISTER_OP(gammaln) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"gammaln"}); NNVM_REGISTER_OP(_backward_gammaln) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -228,7 +228,7 @@ NNVM_REGISTER_OP(_backward_digamma) // logical not NNVM_REGISTER_OP(logical_not) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"logical_not"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op_logexp.cu b/src/operator/tensor/elemwise_unary_op_logexp.cu index febc1914feb7..42189497af88 100644 --- a/src/operator/tensor/elemwise_unary_op_logexp.cu +++ b/src/operator/tensor/elemwise_unary_op_logexp.cu @@ -28,19 +28,19 @@ namespace op { // exp NNVM_REGISTER_OP(exp) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"exp"}); // log NNVM_REGISTER_OP(log) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"log"}); // log10 NNVM_REGISTER_OP(log10) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"log10"}); // log2 NNVM_REGISTER_OP(log2) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"log2"}); NNVM_REGISTER_OP(_backward_log) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -56,8 +56,8 @@ NNVM_REGISTER_OP(_backward_log2) // log1p NNVM_REGISTER_OP(log1p) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"log1p"}) +.set_attr("FComputeEx", UnaryRTCCompute{"log1p"}); NNVM_REGISTER_OP(_backward_log1p) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -65,8 +65,8 @@ NNVM_REGISTER_OP(_backward_log1p) // expm1 NNVM_REGISTER_OP(expm1) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"expm1"}) +.set_attr("FComputeEx", UnaryRTCCompute{"expm1"}); NNVM_REGISTER_OP(_backward_expm1) .set_attr("FCompute", ElemwiseBinaryOp::Compute< diff --git a/src/operator/tensor/elemwise_unary_op_pow.cu b/src/operator/tensor/elemwise_unary_op_pow.cu index 287a2e87be73..ed555b153261 100644 --- a/src/operator/tensor/elemwise_unary_op_pow.cu +++ b/src/operator/tensor/elemwise_unary_op_pow.cu @@ -29,7 +29,7 @@ namespace op { // reciprocal NNVM_REGISTER_OP(reciprocal) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"reciprocal"}); NNVM_REGISTER_OP(_backward_reciprocal) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -37,8 +37,8 @@ NNVM_REGISTER_OP(_backward_reciprocal) // square NNVM_REGISTER_OP(square) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"square"}) +.set_attr("FComputeEx", UnaryRTCCompute{"square"}); NNVM_REGISTER_OP(_backward_square) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -46,8 +46,8 @@ NNVM_REGISTER_OP(_backward_square) // sqrt NNVM_REGISTER_OP(sqrt) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"sqrt"}) +.set_attr("FComputeEx", UnaryRTCCompute{"sqrt"}); NNVM_REGISTER_OP(_backward_sqrt) 
.set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -55,7 +55,7 @@ NNVM_REGISTER_OP(_backward_sqrt) // rsqrt NNVM_REGISTER_OP(rsqrt) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"rsqrt"}); NNVM_REGISTER_OP(_backward_rsqrt) .set_attr("FCompute", @@ -63,8 +63,8 @@ NNVM_REGISTER_OP(_backward_rsqrt) // cbrt NNVM_REGISTER_OP(cbrt) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"cbrt"}) +.set_attr("FComputeEx", UnaryRTCCompute{"cbrt"}); NNVM_REGISTER_OP(_backward_cbrt) @@ -73,7 +73,7 @@ NNVM_REGISTER_OP(_backward_cbrt) // rcbrt NNVM_REGISTER_OP(rcbrt) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"rcbrt"}); NNVM_REGISTER_OP(_backward_rcbrt) .set_attr("FCompute", diff --git a/src/operator/tensor/elemwise_unary_op_trig.cu b/src/operator/tensor/elemwise_unary_op_trig.cu index f5e9d1ccbd6c..7407252ac53a 100644 --- a/src/operator/tensor/elemwise_unary_op_trig.cu +++ b/src/operator/tensor/elemwise_unary_op_trig.cu @@ -29,8 +29,8 @@ namespace op { // sin NNVM_REGISTER_OP(sin) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"sin"}) +.set_attr("FComputeEx", UnaryRTCCompute{"sin"}); NNVM_REGISTER_OP(_backward_sin) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -38,7 +38,7 @@ NNVM_REGISTER_OP(_backward_sin) // cos NNVM_REGISTER_OP(cos) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"cos"}); NNVM_REGISTER_OP(_backward_cos) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -46,8 +46,8 @@ NNVM_REGISTER_OP(_backward_cos) // tan NNVM_REGISTER_OP(tan) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"tan"}) +.set_attr("FComputeEx", UnaryRTCCompute{"tan"}); NNVM_REGISTER_OP(_backward_tan) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -55,8 +55,8 @@ NNVM_REGISTER_OP(_backward_tan) // arcsin NNVM_REGISTER_OP(arcsin) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"arcsin"}) +.set_attr("FComputeEx", UnaryRTCCompute{"arcsin"}); NNVM_REGISTER_OP(_backward_arcsin) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -64,7 +64,7 @@ NNVM_REGISTER_OP(_backward_arcsin) // arccos NNVM_REGISTER_OP(arccos) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"arccos"}); NNVM_REGISTER_OP(_backward_arccos) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -72,8 +72,8 @@ NNVM_REGISTER_OP(_backward_arccos) // arctan NNVM_REGISTER_OP(arctan) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"arctan"}) +.set_attr("FComputeEx", UnaryRTCCompute{"arctan"}); NNVM_REGISTER_OP(_backward_arctan) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -81,8 +81,8 @@ NNVM_REGISTER_OP(_backward_arctan) // degrees NNVM_REGISTER_OP(degrees) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"degrees"}) +.set_attr("FComputeEx", UnaryRTCCompute{"degrees"}); NNVM_REGISTER_OP(_backward_degrees) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -90,8 +90,8 @@ NNVM_REGISTER_OP(_backward_degrees) // radians NNVM_REGISTER_OP(radians) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", 
UnaryRTCCompute{"radians"}) +.set_attr("FComputeEx", UnaryRTCCompute{"radians"}); NNVM_REGISTER_OP(_backward_radians) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -99,7 +99,7 @@ NNVM_REGISTER_OP(_backward_radians) // cosh NNVM_REGISTER_OP(cosh) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"cosh"}); NNVM_REGISTER_OP(_backward_cosh) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -107,8 +107,8 @@ NNVM_REGISTER_OP(_backward_cosh) // sinh NNVM_REGISTER_OP(sinh) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"sinh"}) +.set_attr("FComputeEx", UnaryRTCCompute{"sinh"}); NNVM_REGISTER_OP(_backward_sinh) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -116,8 +116,8 @@ NNVM_REGISTER_OP(_backward_sinh) // tanh NNVM_REGISTER_OP(tanh) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"tanh"}) +.set_attr("FComputeEx", UnaryRTCCompute{"tanh"}); NNVM_REGISTER_OP(_backward_tanh) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -125,8 +125,8 @@ NNVM_REGISTER_OP(_backward_tanh) // arcsinh NNVM_REGISTER_OP(arcsinh) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"arcsinh"}) +.set_attr("FComputeEx", UnaryRTCCompute{"arcsinh"}); NNVM_REGISTER_OP(_backward_arcsinh) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -134,7 +134,7 @@ NNVM_REGISTER_OP(_backward_arcsinh) // arccosh NNVM_REGISTER_OP(arccosh) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"arccosh"}); NNVM_REGISTER_OP(_backward_arccosh) .set_attr("FCompute", ElemwiseBinaryOp::Compute< @@ -142,8 +142,8 @@ NNVM_REGISTER_OP(_backward_arccosh) // arctanh NNVM_REGISTER_OP(arctanh) -.set_attr("FCompute", UnaryOp::Compute) -.set_attr("FComputeEx", UnaryOp::ComputeEx); +.set_attr("FCompute", UnaryRTCCompute{"arctanh"}) +.set_attr("FComputeEx", UnaryRTCCompute{"arctanh"}); NNVM_REGISTER_OP(_backward_arctanh) .set_attr("FCompute", ElemwiseBinaryOp::Compute< diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 8d3ce47c18e8..a100f8b78c13 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -108,6 +108,7 @@ def check_unary_ops(): 'gammaln', 'erf', 'negative', + 'logical_not', ] def announce_check(op_name): From c5aeeef452c4a49f4e1ee8467f46058c3aeae5a9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 29 May 2020 11:16:10 -0700 Subject: [PATCH 05/64] Support binary_scalar forward Remove elemwise_scatter_op.* Fix BinaryScalar usage in NumPy --- python/mxnet/contrib/amp/lists/symbol_fp16.py | 3 - src/common/cuda/rtc/forward_functions-inl.h | 180 ++++++++++ src/common/cuda/rtc/type-inl.h | 5 + .../contrib/gradient_multiplier_op.cu | 4 +- src/operator/fusion/fused_op-inl.h | 3 + .../numpy/np_elemwise_broadcast_logic_op.cu | 2 +- .../numpy/np_elemwise_broadcast_op.cu | 16 +- .../np_elemwise_broadcast_op_extended.cu | 20 +- .../tensor/elemwise_binary_scalar_op.cuh | 194 +++++++---- .../tensor/elemwise_binary_scalar_op.h | 10 - .../tensor/elemwise_binary_scalar_op_basic.cu | 28 +- .../elemwise_binary_scalar_op_extended.cu | 12 +- .../tensor/elemwise_binary_scalar_op_logic.cu | 30 +- src/operator/tensor/elemwise_scatter_op.cc | 142 -------- src/operator/tensor/elemwise_scatter_op.cu | 47 --- src/operator/tensor/elemwise_scatter_op.h | 318 ------------------ 
src/operator/tensor/elemwise_unary_op.cuh | 76 +++-- tests/python/unittest/test_sparse_operator.py | 146 -------- 18 files changed, 408 insertions(+), 828 deletions(-) delete mode 100644 src/operator/tensor/elemwise_scatter_op.cc delete mode 100644 src/operator/tensor/elemwise_scatter_op.cu delete mode 100644 src/operator/tensor/elemwise_scatter_op.h diff --git a/python/mxnet/contrib/amp/lists/symbol_fp16.py b/python/mxnet/contrib/amp/lists/symbol_fp16.py index 506757307352..5858eb9ff092 100644 --- a/python/mxnet/contrib/amp/lists/symbol_fp16.py +++ b/python/mxnet/contrib/amp/lists/symbol_fp16.py @@ -184,8 +184,6 @@ '_sample_poisson', '_sample_uniform', '_sample_unique_zipfian', - '_scatter_minus_scalar', - '_scatter_plus_scalar', '_scatter_set_nd', '_set_value', '_shuffle', @@ -508,7 +506,6 @@ '_Mul', '_Div', '_div', - '_scatter_elemwise_div', '_Mod', '_Not_Equal', '_Equal', diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 756c5e5efa01..e29ec3054f6e 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -284,6 +284,11 @@ __device__ inline DType sub(const DType a, const DType2 b) { return a - b; } +template +__device__ inline DType rsub(const DType a, const DType2 b) { + return b - a; +} + template __device__ inline DType mul(const DType a, const DType2 b) { return a * b; @@ -330,6 +335,181 @@ __device__ inline DType min(const DType a, const DType2 b) { DEFINE_BINARY_MATH_FUNC(hypot, ::hypot, ::hypotf) +template +__device__ inline DType mod(const DType a, const DType2 b) { + if (b == 0) { + return 0; + } + const double ad = static_cast(a); + const double bd = static_cast(b); + if (bd < 0) { + if (ad < 0) { + return -::fmod(-ad, -bd); + } else { + return ::fmod(ad, -bd) + + (::fmod(ad, -bd) != 0 ? bd : 0); + } + } else { + if (ad < 0) { + return -::fmod(-ad, bd) + + (::fmod(-ad, bd) != 0 ? bd : 0); + } else { + return ::fmod(ad, bd); + } + } +} + +template +__device__ inline DType rmod(const DType a, const DType2 b) { + return op::mod(b, a); +} + +template +__device__ inline DType equal(const DType a, const DType2 b) { + return a == static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType not_equal(const DType a, const DType2 b) { + return a != static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType greater(const DType a, const DType2 b) { + return a > static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType greater_equal(const DType a, const DType2 b) { + return a >= static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType less(const DType a, const DType2 b) { + return a < static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType less_equal(const DType a, const DType2 b) { + return a <= static_cast(b) ? 1 : 0; +} + +template +__device__ inline bool np_equal(const DType a, const DType2 b) { + return a == static_cast(b) ? true : false; +} + +template +__device__ inline bool np_not_equal(const DType a, const DType2 b) { + return a != static_cast(b) ? true : false; +} + +template +__device__ inline bool np_greater(const DType a, const DType2 b) { + return a > static_cast(b) ? true : false; +} + +template +__device__ inline bool np_greater_equal(const DType a, const DType2 b) { + return a >= static_cast(b) ? true : false; +} + +template +__device__ inline bool np_less(const DType a, const DType2 b) { + return a < static_cast(b) ? 
true : false; +} + +template +__device__ inline bool np_less_equal(const DType a, const DType2 b) { + return a <= static_cast(b) ? true : false; +} + +template +__device__ inline DType logical_and(const DType a, const DType2 b) { + return a && static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType logical_or(const DType a, const DType2 b) { + return a || static_cast(b) ? 1 : 0; +} + +template +__device__ inline DType logical_xor(const DType a, const DType2 b) { + const DType bb = static_cast(b); + return ((a || bb) && !(a && bb)) ? 1 : 0; +} + +template +__device__ inline DType copysign(const DType a, const DType2 b) { + return (a >= 0 && b >= 0) || (a < 0 && b < 0) ? a : -a; +} + +template +__device__ inline DType rcopysign(const DType a, const DType2 b) { + return copysign(b, a); +} + +template +__device__ inline DType lcm(const DType a, const DType2 b) { + if (type_util::is_integral::value && + type_util::is_integral::value) { + DType A = a; + DType2 B = b; + // minus cases. + if (a < 0) { + A = -a; + } + if (b < 0) { + B = -b; + } + // handle zero-valued cases. + DType c; + if (a == 0 || b == 0) { + c = 0; + } else { + DType tmp; + DType tmp_a = A; + DType tmp_b = B; + if (A < B) { + tmp = A; + A = B; + B = tmp; + } + while (A % B != 0) { + A = A % B; + tmp = A; + A = B; + B = tmp; + } + c = tmp_a / B * tmp_b; + } + return c; + } else { + return 0; + } +} + +template +__device__ inline DType bitwise_xor(const DType a, const DType2 b) { + return static_cast(a) ^ static_cast(b); +} + + +DEFINE_BINARY_MATH_FUNC(arctan2, ::atan2, ::atan2f) + +template +__device__ inline DType rarctan2(const DType a, const DType2 b) { + return arctan2(b, a); +} + +DEFINE_BINARY_MATH_FUNC(ldexp, a * ::pow(2.0f, b), a * ::powf(2.0f, b)) + +template +__device__ inline DType rldexp(const DType a, const DType2 b) { + return ldexp(b, a); +} + #undef DEFINE_BINARY_MATH_FUNC template diff --git a/src/common/cuda/rtc/type-inl.h b/src/common/cuda/rtc/type-inl.h index 9165926ac1e8..888a1f734587 100644 --- a/src/common/cuda/rtc/type-inl.h +++ b/src/common/cuda/rtc/type-inl.h @@ -56,6 +56,11 @@ template <> struct is_integral : true_type {}; template <> struct is_integral : true_type {}; template <> struct is_integral : true_type {}; +// is_unsigned +template struct is_unsigned : false_type {}; +template <> struct is_unsigned : true_type {}; +template <> struct is_unsigned : true_type {}; + // is_same template struct is_same : false_type {}; diff --git a/src/operator/contrib/gradient_multiplier_op.cu b/src/operator/contrib/gradient_multiplier_op.cu index 7159cea9805d..f519f0db5f49 100644 --- a/src/operator/contrib/gradient_multiplier_op.cu +++ b/src/operator/contrib/gradient_multiplier_op.cu @@ -34,8 +34,8 @@ NNVM_REGISTER_OP(_contrib_gradientmultiplier) .set_attr("FCompute", UnaryOp::IdentityCompute); NNVM_REGISTER_OP(_contrib_backward_gradientmultiplier) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"mul"}) +.set_attr("(FComputeEx", BinaryScalarRTCCompute{"mul"}); } // namespace op } // namespace mxnet diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index ea2303e3aa1f..066f54c0744e 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -51,6 +51,7 @@ const std::map>> ops_desc = { {"_maximum" , {{"op::max(%, %)", "_0", "_1"}}}, {"_Minimum" , {{"op::min(%, %)", "_0", "_1"}}}, {"_minimum" , {{"op::min(%, %)", "_0", "_1"}}}, + {"_mod" , 
{{"op::mod(%, %)", "_0", "_1"}}}, {"amp_cast" , {{"op::identity(%)", "_0"}}}, {"_backward_amp_cast" , {{"op::identity(%)", "_0"}}}, {"relu" , {{"op::relu(%)", "_0"}}}, @@ -120,6 +121,8 @@ const std::map>> ops_desc = { {"_rpower_scalar" , {{"op::rpow(%, float(%))", "_0", "scalar"}}}, {"_RPowerScalar" , {{"op::rpow(%, float(%))", "_0", "scalar"}}}, {"_RDivScalar" , {{"op::rdiv(%, float(%))", "_0", "scalar"}}}, + {"_mod_scalar" , {{"op::mod(%, float(%))", "_0", "scalar"}}}, + {"_rmod_scalar" , {{"op::rmod(%, float(%))", "_0", "scalar"}}}, {"Cast" , {{"op::cast<%>(%)", "dtype", "_0"}}}, {"cast" , {{"op::cast<%>(%)", "dtype", "_0"}}}, {"Activation" , {{"op::%(%)", "act_type", "_0"}}}, diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.cu b/src/operator/numpy/np_elemwise_broadcast_logic_op.cu index e27535d9f4f5..42acbea563e6 100644 --- a/src/operator/numpy/np_elemwise_broadcast_logic_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.cu @@ -38,7 +38,7 @@ namespace op { #define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_GPU(name) \ NNVM_REGISTER_OP(_npi_##name##_scalar) \ - .set_attr("FCompute", BinaryScalarOp::ComputeLogic) + .set_attr("FCompute", BinaryScalarRTCCompute{"np_" #name}) MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(equal); MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(not_equal); diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index a2927cda61ff..82958924d933 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -79,28 +79,28 @@ NNVM_REGISTER_OP(_backward_npi_broadcast_power) mshadow_op::power_rgrad>); NNVM_REGISTER_OP(_npi_add_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"add"}); NNVM_REGISTER_OP(_npi_subtract_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"sub"}); NNVM_REGISTER_OP(_npi_rsubtract_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rsub"}); NNVM_REGISTER_OP(_npi_multiply_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"mul"}); NNVM_REGISTER_OP(_npi_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"mod"}); NNVM_REGISTER_OP(_npi_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rmod"}); NNVM_REGISTER_OP(_npi_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"power"}); NNVM_REGISTER_OP(_npi_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rpow"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu index 8f135b3efd03..9fdbac6b65fa 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu @@ -60,10 +60,10 @@ NNVM_REGISTER_OP(_backward_npi_hypot) .set_attr("FCompute", BinaryBroadcastBackwardUseIn); NNVM_REGISTER_OP(_npi_copysign_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"copysign"}); NNVM_REGISTER_OP(_npi_rcopysign_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rcopysign"}); 
NNVM_REGISTER_OP(_backward_npi_copysign_scalar) .set_attr("FCompute", @@ -74,25 +74,25 @@ NNVM_REGISTER_OP(_backward_npi_rcopysign_scalar) BinaryScalarOp::Backward); NNVM_REGISTER_OP(_npi_arctan2_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"arctan2"}); NNVM_REGISTER_OP(_backward_npi_arctan2_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_npi_rarctan2_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rarctan2"}); NNVM_REGISTER_OP(_backward_npi_rarctan2_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_npi_lcm_scalar) -.set_attr("FCompute", BinaryScalarOp::ComputeInt); +.set_attr("FCompute", BinaryScalarRTCCompute{"lcm"}); NNVM_REGISTER_OP(_npi_bitwise_and_scalar) .set_attr("FCompute", BinaryScalarOp::ComputeInt); NNVM_REGISTER_OP(_npi_bitwise_xor_scalar) -.set_attr("FCompute", BinaryScalarOp::ComputeInt); +.set_attr("FCompute", BinaryScalarRTCCompute{"bitwise_xor"}); NNVM_REGISTER_OP(_npi_bitwise_or_scalar) .set_attr("FCompute", BinaryScalarOp::ComputeInt); @@ -101,10 +101,10 @@ NNVM_REGISTER_OP(_npi_ldexp) .set_attr("FCompute", BinaryBroadcastCompute); NNVM_REGISTER_OP(_npi_ldexp_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"ldexp"}); NNVM_REGISTER_OP(_npi_rldexp_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rldexp"}); NNVM_REGISTER_OP(_backward_npi_ldexp) .set_attr("FCompute", BinaryBroadcastBackwardUseIn #include "../operator_common.h" #include "../../common/cuda/vectorization.cuh" +#include "elemwise_unary_op.h" #include @@ -50,36 +51,6 @@ struct VectorizedKernelParams { DType scalar; }; -template -__global__ void VectorizedBinaryScalarKernelFwd(const VectorizedKernelParams params, - const index_t N) { - VectorizedLoader loader0(params.inputs[0], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = loader0.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader0.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader0.nvec(); ++i) { - DType temp = OP::Map(loader0.separate()[i], - params.scalar); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - template __global__ void VectorizedBinaryScalarKernelBwd(const VectorizedKernelParams params, const index_t N) { @@ -113,21 +84,6 @@ __global__ void VectorizedBinaryScalarKernelBwd(const VectorizedKernelParams -class VectorizedBinaryScalarFwd { - public: - using ParamType = VectorizedKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryScalarKernelFwd - <<>>(params, lead_dim); - } -}; - template class VectorizedBinaryScalarBwd { public: @@ -145,35 +101,133 @@ class VectorizedBinaryScalarBwd { } // namespace binary_scalar -template -void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary_scalar; - if 
(req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); - const double alpha = param.scalar; - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedBinaryScalarFwd; +struct binary_scalar_kernel_params { + const void *inputs[1]; + void *outputs[1]; + double scalar; +}; - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.scalar = (DType)alpha; +const char binary_scalar_kernel_fwd[] = R"code( - VectorizedKernelLauncher(size, 1, s, params); - }); - }); +struct binary_scalar_kernel_params { + const void *inputs[1]; + void *outputs[1]; + double scalar; +}; + +__global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input = IType::from(loader.separate()[i]); + // enables returning different type + const auto temp = OP(input, static_cast(params.scalar)); + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } } +)code"; + +struct BinaryScalarRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + binary_scalar_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; + + const index_t size = outputs[0].Size(); + binary_scalar_kernel_params params = { {inputs[0].dptr_}, + {outputs[0].dptr_}, + alpha }; + + VectorizedKernelRTCLauncher(code, "binary_scalar_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); + } + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + InitStorageGeometry<1, 1>(attrs, inputs, outputs); + CHECK_NE(outputs[0].storage_type(), kDefaultStorage) + << "This function works only for sparse types."; + CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()) + << "The storage type of both inputs and outputs needs to be the same."; + AllocateGeometry(&outputs[0], req[0], &inputs[0]); + CopyGeometryBlobs(ctx.get_stream(), &outputs[0], req[0], inputs[0]); + outputs[0].CheckAndAllocData(inputs[0].storage_shape()); + if (inputs[0].storage_shape().Size()) { + std::vector in_blobs, out_blobs; + in_blobs.reserve(inputs.size()); + out_blobs.reserve(outputs.size()); + for (auto &input : inputs) { + in_blobs.emplace_back(input.data()); + } + for (auto &output : outputs) { + out_blobs.emplace_back(output.data()); + } + this->operator()(attrs, ctx, in_blobs, req, out_blobs); + } + } +}; + template void BinaryScalarOp::Backward_(const nnvm::NodeAttrs &attrs, mshadow::Stream* s, diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 568daee2f589..be9adee35fb9 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -298,16 +298,6 @@ class BinaryScalarOp : public UnaryOp { }); } -#if MXNET_USE_CUDA - template - static void Compute_(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - template static void Compute(const nnvm::NodeAttrs &attrs, const OpContext &ctx, diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index 3fd017f09ec7..1425702459db 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -29,44 +29,44 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_plus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"add"}); NNVM_REGISTER_OP(_minus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"sub"}); NNVM_REGISTER_OP(_rminus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rsub"}); NNVM_REGISTER_OP(_mul_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"mul"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"mul"}); NNVM_REGISTER_OP(_backward_mul_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"mul"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"mul"}); NNVM_REGISTER_OP(_div_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); +.set_attr("FCompute", 
BinaryScalarRTCCompute{"div"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"div"}); NNVM_REGISTER_OP(_backward_div_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"div"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"div"}); NNVM_REGISTER_OP(_rdiv_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rdiv"}); NNVM_REGISTER_OP(_backward_rdiv_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rmod"}); NNVM_REGISTER_OP(_backward_rmod_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index f09e40a2eee7..9bb047569bf7 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -29,37 +29,37 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_maximum_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"max"}); NNVM_REGISTER_OP(_backward_maximum_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_minimum_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_minimum_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_power_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rpow"}); NNVM_REGISTER_OP(_backward_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_hypot_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_hypot_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"smooth_l1"}); NNVM_REGISTER_OP(_backward_smooth_l1) .set_attr("FCompute", BinaryScalarOp::Backward); diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu index 6c393e0719a5..70ef26b0a5e5 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cu @@ -28,37 +28,37 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"equal"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"equal"}); NNVM_REGISTER_OP(_not_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"not_equal"}) +.set_attr("FComputeEx", 
BinaryScalarRTCCompute{"not_equal"}); NNVM_REGISTER_OP(_greater_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"greater"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"greater"}); NNVM_REGISTER_OP(_greater_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"greater_equal"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"greater_equal"}); NNVM_REGISTER_OP(_lesser_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"less"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"less"}); NNVM_REGISTER_OP(_lesser_equal_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::LogicComputeEx); +.set_attr("FCompute", BinaryScalarRTCCompute{"less_equal"}) +.set_attr("FComputeEx", BinaryScalarRTCCompute{"less_equal"}); NNVM_REGISTER_OP(_logical_and_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"logical_and"}); NNVM_REGISTER_OP(_logical_or_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"logical_or"}); NNVM_REGISTER_OP(_logical_xor_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"logical_xor"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_scatter_op.cc b/src/operator/tensor/elemwise_scatter_op.cc deleted file mode 100644 index 41f22b057a53..000000000000 --- a/src/operator/tensor/elemwise_scatter_op.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file elemwise_scatter_op.cc - * \brief CPU implementation of elementwise scatter operators - */ -#include "./elemwise_binary_op-inl.h" -#include "./elemwise_binary_scalar_op.h" -#include "./elemwise_scatter_op.h" - -namespace mxnet { -namespace op { - -static bool StorageTypeRspOrDenseOutput(const NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - const auto lhs_stype = static_cast((*in_attrs)[0]); - if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) - && common::ContainsOnlyStorage(*out_attrs, kDefaultStorage)) { - if (storage_type_assign(&out_attrs[0], kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute)) { - return true; - } - } - if (lhs_stype == kRowSparseStorage) { - if (storage_type_assign(&out_attrs[0], kRowSparseStorage, - dispatch_mode, - DispatchMode::kFComputeEx)) { - return true; - } - } - return dispatch_fallback(out_attrs, dispatch_mode); -} - -static bool StorageTypeScatteredScalarOp(const NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - // Supports kDefaultStorage, kRowSparseStorage and kCSRStorage - const auto stype = static_cast((*in_attrs)[0]); - if (storage_type_assign(out_attrs, - stype, - dispatch_mode, - stype == kDefaultStorage ? DispatchMode::kFCompute - : DispatchMode::kFComputeEx)) { - return true; - } - return dispatch_fallback(out_attrs, dispatch_mode); -} - -/*! \brief _scatter_elemwise_div */ -MXNET_OPERATOR_REGISTER_BINARY(_scatter_elemwise_div) -.set_attr("FCompute", ElemwiseScatterBinaryOp::Compute) -.set_attr("FComputeEx", ElemwiseScatterBinaryOp::ComputeEx< - cpu, op::mshadow_op::div>) -.describe(R"code(Divides arguments element-wise. If the left-hand-side input is 'row_sparse', then -only the values which exist in the left-hand sparse array are computed. The 'missing' values -are ignored. - -The storage type of ``_scatter_elemwise_div`` output depends on storage types of inputs - -- _scatter_elemwise_div(row_sparse, row_sparse) = row_sparse -- _scatter_elemwise_div(row_sparse, dense) = row_sparse -- _scatter_elemwise_div(row_sparse, csr) = row_sparse -- otherwise, ``_scatter_elemwise_div`` behaves exactly like elemwise_div and generates output -with default storage - -)code") -.set_attr("FInferStorageType", StorageTypeRspOrDenseOutput) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("THasDeterministicOutput", true) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_div"}); - -/*! \brief _scatter_plus_scalar */ -MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_scatter_plus_scalar) -.describe(R"code(Adds a scalar to a tensor element-wise. If the left-hand-side input is -'row_sparse' or 'csr', then only the values which exist in the left-hand sparse array are computed. -The 'missing' values are ignored. 
- -The storage type of ``_scatter_plus_scalar`` output depends on storage types of inputs - -- _scatter_plus_scalar(row_sparse, scalar) = row_sparse -- _scatter_plus_scalar(csr, scalar) = csr -- otherwise, ``_scatter_plus_scalar`` behaves exactly like _plus_scalar and generates output -with default storage - -)code") -.set_attr("FInferStorageType", StorageTypeScatteredScalarOp) -.set_attr("FCompute", - ElemwiseScatterBinaryScalarOp::Compute) -.set_attr("FComputeEx", - ElemwiseScatterBinaryScalarOp::ComputeEx) -.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); - -/*! \brief _scatter_minus_scalar */ -MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_scatter_minus_scalar) - .describe(R"code(Subtracts a scalar to a tensor element-wise. If the left-hand-side input is -'row_sparse' or 'csr', then only the values which exist in the left-hand sparse array are computed. -The 'missing' values are ignored. - -The storage type of ``_scatter_minus_scalar`` output depends on storage types of inputs - -- _scatter_minus_scalar(row_sparse, scalar) = row_sparse -- _scatter_minus_scalar(csr, scalar) = csr -- otherwise, ``_scatter_minus_scalar`` behaves exactly like _minus_scalar and generates output -with default storage - -)code") -.set_attr("FInferStorageType", StorageTypeScatteredScalarOp) -.set_attr("FCompute", - ElemwiseScatterBinaryScalarOp::Compute) -.set_attr("FComputeEx", - ElemwiseScatterBinaryScalarOp::ComputeEx) -.set_attr("FGradient", ElemwiseGradUseNone{"_copy"}); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/tensor/elemwise_scatter_op.cu b/src/operator/tensor/elemwise_scatter_op.cu deleted file mode 100644 index 913aa9512193..000000000000 --- a/src/operator/tensor/elemwise_scatter_op.cu +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - /*! 
- * \file elemwise_scatter_op.cu - * \brief GPU implementation of elementwise scatter operators - */ -#include "./elemwise_binary_scalar_op.h" -#include "./elemwise_scatter_op.h" - -namespace mxnet { -namespace op { - -NNVM_REGISTER_OP(_scatter_elemwise_div) -.set_attr("FCompute", ElemwiseScatterBinaryOp::Compute) -.set_attr("FComputeEx", ElemwiseScatterBinaryOp::ComputeEx); - -NNVM_REGISTER_OP(_scatter_plus_scalar) -.set_attr("FCompute", - ElemwiseScatterBinaryScalarOp::Compute) -.set_attr("FComputeEx", - ElemwiseScatterBinaryScalarOp::ComputeEx); - -NNVM_REGISTER_OP(_scatter_minus_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FComputeEx", BinaryScalarOp::ComputeEx); - -} // namespace op -} // namespace mxnet - diff --git a/src/operator/tensor/elemwise_scatter_op.h b/src/operator/tensor/elemwise_scatter_op.h deleted file mode 100644 index 0e52a86da8bf..000000000000 --- a/src/operator/tensor/elemwise_scatter_op.h +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file elemwise_scatter_op.h - * \brief Function definition of elementwise scatter operators - */ -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_SCATTER_OP_H_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_SCATTER_OP_H_ - -#include -#include "./elemwise_binary_op.h" -#include "./elemwise_binary_scalar_op.h" -#include "sparse_retain-inl.h" -#include "cast_storage-inl.h" - -namespace mxnet { -namespace op { - -/*! - * \brief Shared helper functions for scatter ops - */ -class ScatterOpBase { - /*! \brief Protected in order to prevent widespread use. Scatter ops is a special case */ - protected: - /*! - * \brief For some situations, we need to do the computation as dense and then use - * sparse-retain to strip out the portions we aren't interested in. - * \note If your operastor uses this function, it must request kTempStorage - * \tparam xpu gpu or cpu - * \tparam Function Function to call with dense inputs and outputs - * \param attrs Operator attributes - * \param ctx Operator context - * \param inputs Input NDArrays - * \param req Operation request - * \param outputs Output NDArrays - * \param function - */ - template - static void ComputeAsDense(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs, - Function function) { - std::vector output_converted; - std::vector input_data, output_data; - std::vector other_inputs, other_outputs; - other_inputs.reserve(inputs.size()); - input_data.reserve(inputs.size()); - output_data.reserve(outputs.size()); - other_outputs.reserve(outputs.size()); - output_converted.reserve(outputs.size()); - // Inputs... 
- for (const NDArray& nd : inputs) { - if (nd.storage_type() != kDefaultStorage) { - NDArray in(nd.shape(), ctx.run_ctx.get_ctx()); - CastStorageComputeEx(attrs, ctx, { nd }, req, { in }); - other_inputs.push_back(in); - input_data.push_back(in.data()); - } else { - input_data.push_back(nd.data()); - } - } - - // Outputs... - for (const NDArray& nd : outputs) { - if (nd.storage_type() != kDefaultStorage) { - NDArray out(nd.shape(), ctx.run_ctx.get_ctx()); - CastStorageComputeEx(attrs, ctx, { nd }, req, { out }); - other_outputs.push_back(out); - output_data.push_back(out.data()); - output_converted.push_back(true); - } else { - other_outputs.push_back(nd); - output_data.push_back(nd.data()); - output_converted.push_back(false); - } - } - - // Call the function - function(attrs, ctx, input_data, req, output_data); - - // Convert output(s) back if necessary - for (size_t i = 0, n = outputs.size(); i < n; ++i) { - if (output_converted[i]) { - CastStorageComputeEx(attrs, - ctx, - { other_outputs[i] }, - req, - { outputs[i] }); - } - } - } - - /*! - * \brief Execute the supplied function/operation, followed by a sparse retain operation - * of the lhs argument's rows only (row indices) - * \tparam xpu gpu or cpu - * \tparam Function Function type call to wrap and return sparse-retained output - * \param attrs Operator attributes - * \param ctx Operator context - * \param inputs Input NDArrays - * \param req Operation request - * \param outputs Output NDArrays - * \param pre_retain Whether to call SparseRetain before calling the given function - * \param function Function call to wrap and return sparse-retained output - */ - template - static void ScatterWrap(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs, - bool pre_retain, - Function function) { - CHECK_EQ(outputs.size(), 1U); - if (inputs[0].storage_type() == kRowSparseStorage - && outputs[0].storage_type() == kRowSparseStorage) { - if (pre_retain && inputs[1].storage_type() == kRowSparseStorage) { - // Retain only rhs rows which have same row as lhs input - NDArray retained_input(outputs[0].storage_type(), outputs[0].shape(), outputs[0].ctx()); - SparseRetainOpForwardEx(attrs, ctx, - { inputs[1], inputs[0].aux_ndarray(rowsparse::kIdx) }, - req, - {retained_input}); - CHECK(retained_input.storage_initialized()); - // Perform the operation - function(attrs, ctx, {inputs[0], retained_input}, req, outputs); - // Sanity check - DCHECK_LE(outputs[0].aux_shape(rowsparse::kIdx).Size(), - inputs[0].aux_shape(rowsparse::kIdx).Size()); - } else { - // Perform the operation as usual - NDArray temp_out(outputs[0].storage_type(), outputs[0].shape(), outputs[0].ctx()); - function(attrs, ctx, inputs, req, { temp_out }); - CHECK(temp_out.storage_initialized()); - CHECK_EQ(temp_out.storage_type(), kRowSparseStorage); - // Sparse-retain the output based upon lhs-input sparsity - const NDArray indices(inputs[0].aux_data(rowsparse::kIdx), inputs[0].ctx().dev_id); - SparseRetainOpForwardEx(attrs, ctx, { temp_out, indices }, - req, outputs); - DCHECK_LE(outputs[0].aux_shape(rowsparse::kIdx).Size(), - inputs[0].aux_shape(rowsparse::kIdx).Size()); - } - } else { - function(attrs, ctx, inputs, req, outputs); - } - } -}; - -/*! \brief Scatter elemwise binary op handlers */ -class ElemwiseScatterBinaryOp : public ElemwiseBinaryOp, - public ScatterOpBase { - /*! 
\brief CPU version, RspRsp knows how to do an efficient scatter, - * otherwise retain rhs + normal op */ - template - static void ComputeEx_(mshadow::Stream *stream, - const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - // row_sparse-op-row_sparse or row_sparse-op-default can call RspRsp - const NDArrayStorageType input0_stype = inputs[0].storage_type(); - const NDArrayStorageType input1_stype = inputs[1].storage_type(); - if (input0_stype == kRowSparseStorage - && (input1_stype == kRowSparseStorage || input1_stype == kDefaultStorage) - && outputs[0].storage_type() == kRowSparseStorage) { - mshadow::Stream *s = ctx.get_stream(); - RspRspOp(s, attrs, ctx, inputs[0], inputs[1], req[0], outputs[0], - false, true, false, true); - CHECK_EQ(inputs[0].aux_shape(rowsparse::kIdx).Size(), - outputs[0].aux_shape(rowsparse::kIdx).Size()); - } else { - ScatterWrap(attrs, ctx, inputs, req, - outputs, true, [input0_stype, input1_stype](const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if ((input0_stype == kCSRStorage || input1_stype == kCSRStorage) - && input0_stype != input1_stype) { - // Fallback to dense + retain - ComputeAsDense(attrs, ctx, inputs, req, - outputs, ElemwiseBinaryOp::Compute); - } else { - // Normal operation + retain - ElemwiseBinaryOp::ComputeEx(attrs, ctx, inputs, req, outputs); - } - }); - } - } - -#ifdef __CUDACC__ - /*! \brief GPU version, fallback op + retain */ - template - static void ComputeEx_(mshadow::Stream *stream, - const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - ScatterWrap(attrs, ctx, inputs, req, - outputs, false, [](const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - ComputeAsDense(attrs, ctx, inputs, req, outputs, ElemwiseBinaryOp::Compute); - }); - } -#endif // #ifdef __CUDACC__ - - public: - /*! \brief General compute for operations which include sparse tensors */ - template - static void ComputeEx(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - DCHECK_EQ(inputs.size(), 2U); - DCHECK_EQ(outputs.size(), 1U); - ComputeEx_(ctx.get_stream(), attrs, ctx, inputs, req, outputs); - } -}; - -/*! \brief Scatter elemwise binary scalar op handlers */ -class ElemwiseScatterBinaryScalarOp : public BinaryScalarOp, - public ScatterOpBase { - /*! \brief CPU version, retain rhs + normal op */ - template - static void ComputeEx_(mshadow::Stream *stream, - const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - ScatterWrap(attrs, ctx, inputs, req, - outputs, true, [](const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - // Normal operation + retain - BinaryScalarOp::ComputeEx(attrs, ctx, inputs, req, outputs); - }); - } - -#ifdef __CUDACC__ - /*! 
\brief GPU version, fallback op + retain */ - template - static void ComputeEx_(mshadow::Stream *stream, - const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - CHECK_NE(inputs[0].storage_type(), kDefaultStorage); - if (outputs[0].storage_type() == inputs[0].storage_type()) { - BinaryScalarOp::ComputeEx(attrs, ctx, inputs, req, outputs); - } else { - ScatterWrap(attrs, ctx, inputs, req, - outputs, false, [](const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - // Fallback to dense + retain - ComputeAsDense(attrs, ctx, inputs, req, outputs, BinaryScalarOp::Compute); - }); - } - } -#endif // __CUDACC__ - - public: - using BinaryScalarOp::Compute; - - /*! \brief General compute for operations which include sparse tensors */ - template - static void ComputeEx(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - DCHECK_EQ(inputs.size(), 1U); - DCHECK_EQ(outputs.size(), 1U); - CHECK_NE(inputs[0].storage_type(), kDefaultStorage); - if (inputs[0].storage_type() == kRowSparseStorage - && outputs[0].storage_type() == kRowSparseStorage) { - UnaryOp::MapToFCompute(attrs, ctx, inputs, req, outputs, Compute); - } else { - ComputeEx_(ctx.get_stream(), attrs, ctx, inputs, req, outputs); - } - } -}; - -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_SCATTER_OP_H_ diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index 891cba0c129c..e987cc06ebdf 100644 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -95,44 +95,48 @@ __global__ void unary_kernel(const unary_kernel_params params, )code"; struct UnaryRTCCompute { - std::string OP; -void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet::common::cuda::rtc; - if (req[0] == kNullOp) return; - mshadow::Stream* s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - - const std::string code = std::string("const OpReqType req = ") + - util::to_string(req[0]) + - ";\n" + - "#define OP op::" + - OP + - "\n" + - unary_kernel_fwd; - const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; - - const index_t size = outputs[0].Size(); - unary_kernel_params params = { {inputs[0].dptr_}, - {outputs[0].dptr_} }; - - VectorizedKernelRTCLauncher(code, "unary_kernel", nvec, - size, 1, s, params, - inputs, outputs, - ctx.run_ctx.get_ctx().dev_id); -} + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + unary_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; + + const index_t size = outputs[0].Size(); + unary_kernel_params params = { {inputs[0].dptr_}, + {outputs[0].dptr_} }; + + VectorizedKernelRTCLauncher(code, "unary_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); + } -void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); InitStorageGeometry<1, 1>(attrs, inputs, outputs); CHECK_NE(outputs[0].storage_type(), kDefaultStorage) << "This function works only for sparse types."; @@ -153,7 +157,7 @@ void operator()(const nnvm::NodeAttrs& attrs, } this->operator()(attrs, ctx, in_blobs, req, out_blobs); } -} + } }; diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 26f6829e61a7..27291d22d5fa 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1851,152 +1851,6 @@ def check_broadcast_div(mx_lhs, mx_rhs, np_lhs, np_rhs, dtype): check_broadcast_mul(mx_lhs, mx_rhs, np_lhs, np_rhs, np.float32) check_broadcast_div(mx_lhs, mx_rhs, np_lhs, np_rhs, np.float32) -@with_seed() -def test_scatter_ops(): - def csr_get_seen_points(name, csr_array, verbose=False): - """Get a unique list of points int he CSR array as well as a - corresponding parallel list of points and values""" - seen_points = set() - seen_point_list = list() - values = list() - row_count = csr_array.shape[0] - row_pointers = csr_array.indptr.asnumpy() - col_indexes = csr_array.indices.asnumpy() - data = csr_array.data.asnumpy() - for row in range(row_count): - start_pos = row_pointers[row] - end_pos = row_pointers[row + 1] - for col_index in range(start_pos, end_pos): - col = col_indexes[col_index] - val = data[col_index] - if verbose is True: - print("{}: (row, col = ({}, {}) = {}".format(name, row, col, val)) - seen_points.add((row, col)) - seen_point_list.append((row, col)) - values.append(val) - return seen_points, values, seen_point_list - - def check_scatter_ops(name, shape, lhs_stype, rhs_stype, forward_mxnet_call, forward_numpy_call, - density=0.25, rhs_is_scalar=False, verbose=False): - lhs = mx.symbol.Variable('lhs', stype=lhs_stype) - if rhs_is_scalar is False: - rhs = mx.symbol.Variable('rhs', stype=rhs_stype) - - if verbose is True: - print(name) - - if lhs_stype != 'default': - lhs_nd = create_sparse_array_zd( - shape, lhs_stype, density=density, - rsp_indices=gen_rsp_random_indices( - shape, - density=density, - force_indices=[(shape[0]/2)] # force at least one overlap - )) - else: - lhs_nd = rand_ndarray(shape, 'default') - - if rhs_is_scalar is False: - if rhs_stype != 'default': - rhs_nd = create_sparse_array_zd( - shape, rhs_stype, density=density, - rsp_indices=gen_rsp_random_indices( - shape, - density=density, - force_indices=[(shape[0]/2)] # force at least one overlap - )) - else: - rhs_nd = rand_ndarray(shape, 'default') - else: - rhs_nd = 9 - rhs = rhs_nd - - lhs_np = lhs_nd.asnumpy() - rhs_np = rhs_nd if rhs_is_scalar is True else rhs_nd.asnumpy() - - if verbose is True: - print("lhs = {}".format(lhs_np)) - print("rhs = {}".format(rhs_np)) - - out_np = forward_numpy_call(lhs_np, rhs_np) - - if verbose is True: - print("Numpy: out_np = 
{}".format(out_np)) - - location = {'lhs': lhs_nd, 'rhs': rhs_nd} - - out = forward_mxnet_call(lhs, rhs) - exe_test = out._bind(default_context(), args=location) - exe_test.forward(is_train=False) - out_nd = exe_test.outputs[0] - - if verbose is True: - print("Sym: out_nd = {}".format(out_nd.asnumpy())) - - # For row_sparse, check that rows only exist for rows that are - # either int lhs or rhs, and if they exist, they should equal - # the numpy values - if lhs_stype == 'default': - almost_equal(out_nd.asnumpy(), out_np, equal_nan=True) - elif lhs_stype == 'row_sparse': - seen_rows = set() - indices = lhs_nd.indices.asnumpy() - for i in range(len(indices)): - seen_rows.add(indices[i]) - assert len(out_nd.indices.asnumpy()) == len(seen_rows) - out_nd_np = out_nd.asnumpy() - for row in seen_rows: - row_nd = out_nd_np[row] - row_np = out_np[row] - almost_equal(row_nd, row_np, equal_nan=True) - elif lhs_stype == 'csr' and rhs_is_scalar is False: - almost_equal(out_nd.asnumpy(), out_np, equal_nan=True) - else: - assert rhs_is_scalar - lhs_seen_points, _, _ = csr_get_seen_points("lhs", lhs_nd, verbose) - if rhs_is_scalar is False: - rhs_seen_points, _, _ = csr_get_seen_points("rhs", rhs_nd, verbose) - else: - rhs_seen_points = set() - input_seen_points = lhs_seen_points.union(rhs_seen_points) - out_seen_pounts, out_values, seen_point_list = csr_get_seen_points("out_nd", out_nd, verbose) - # Some may have been zero - assert len(out_seen_pounts) <= len(input_seen_points) - out_nd_np = out_nd.asnumpy() - val_index = 0 - for row_col in seen_point_list: - row = row_col[0] - col = row_col[1] - val = out_values[val_index] - val_np = out_nd_np[row, col] - almost_equal(val, val_np, equal_nan=True) - val_index += 1 - - shape = (10, 5) - - for lhs_stype in ['row_sparse', 'default', 'csr']: - for rhs_stype in ['row_sparse', 'default', 'csr']: - print("op: {}, lhs_stype: {}, rhs_stype: {}".format('_scatter_elemwise_div', - lhs_stype, rhs_stype)) - check_scatter_ops('_scatter_elemwise_div', shape, lhs_stype, rhs_stype, - lambda l, r: mx.sym._internal._scatter_elemwise_div(l, r), - lambda l, r: l / r, - verbose=False) - - for lhs_stype in ['row_sparse', 'default', 'csr']: - print("op: {}, lhs_stype: {}".format('_scatter_plus', lhs_stype)) - check_scatter_ops('_scatter_plus', shape, lhs_stype, 'scalar', - lambda l, r: mx.sym._internal._scatter_plus_scalar(l, r), - lambda l, r: l + r, - rhs_is_scalar=True, verbose=False) - - print("op: {}, lhs_stype: {}".format('_scatter_minus', lhs_stype)) - check_scatter_ops('_scatter_minus', shape, lhs_stype, 'scalar', - lambda l, r: mx.sym._internal._scatter_minus_scalar(l, r), - lambda l, r: l + r, - rhs_is_scalar=True, verbose=False, density=0.5) - - @with_seed() def test_batchnorm_fallback(): # same test as test_operator.test_batchnorm_training, but tests fallback logic of batchnorm From 39f6fee19454f6f9796117065469e80c84dc9bdd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 1 Jun 2020 13:04:20 -0700 Subject: [PATCH 06/64] Backward of binary scalar --- src/common/cuda/rtc/backward_functions-inl.h | 91 ++++++++- src/common/cuda/rtc/vectorization-inl.h | 32 +-- src/operator/fusion/fused_op-inl.h | 6 +- src/operator/mshadow_op.h | 2 - .../np_elemwise_broadcast_op_extended.cc | 6 +- .../np_elemwise_broadcast_op_extended.cu | 14 +- src/operator/operator_tune.cc | 1 - .../tensor/elemwise_binary_scalar_op.cuh | 190 +++++++++--------- .../tensor/elemwise_binary_scalar_op.h | 9 - .../tensor/elemwise_binary_scalar_op_basic.cu | 6 +- 
.../elemwise_binary_scalar_op_extended.cu | 12 +- src/operator/tensor/elemwise_unary_op.cuh | 1 - 12 files changed, 217 insertions(+), 153 deletions(-) diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index 72e70cb60ded..7e6da8a214d7 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -171,6 +171,86 @@ __device__ inline DTypeGrad backward_square(const DType val, const DTypeGrad gra return 2 * val * grad; } +template +__device__ inline DType rdiv_grad(const DType val, + const DType2 val2) { + return -val2 / (val * val); +} + +template +__device__ inline DType mod_grad(const DType val, + const DType2 val2) { + if (type_util::is_integral::value) { + return 0; + } else { + return 1; + } +} + +template +__device__ inline DType rmod_grad(const DType val, + const DType2 val2) { + if (type_util::is_integral::value) { + return 0; + } else { + return -op::floor(val2 / val); + } +} + +template +__device__ inline DType power_grad(const DType val, + const DType2 val2) { + return op::power(val, val2 - 1.f) * val2; +} + +template +__device__ inline DType rpower_grad(const DType val, + const DType2 val2) { + return val * op::log(val2); +} + +template +__device__ inline DType hypot_grad_left(const DType val, + const DType2 val2) { + return val / op::hypot(val, val2); +} + +template +__device__ inline DType hypot_grad_right(const DType val, + const DType2 val2) { + return val2 / op::hypot(val, val2); +} + +template +__device__ inline DType copysign_grad(const DType val, + const DType2 val2) { + return (a >= 0 && b >= 0) || (a < 0 && b < 0) ? 1 : -1; +} + +template +__device__ inline DType arctan2_grad(const DType val, + const DType2 val2) { + return val2 / (val * val + val2 * val2); +} + +template +__device__ inline DType rarctan2_grad(const DType val, + const DType2 val2) { + return val / (val * val + val2 * val2); +} + +template +__device__ inline DType ldexp_grad(const DType val, + const DType2 val2) { + return op::power(static_cast(2), val2); +} + +template +__device__ inline DType rldexp_grad(const DType val, + const DType2 val2) { + returni val2 * op::power(static_cast(2), val) * op::log(static_cast(2)); +} + template __device__ inline DTypeGrad backward_clip(const DType val, const DTypeGrad grad, const float a_min, const float a_max) { @@ -196,17 +276,16 @@ __device__ inline DTypeGrad backward_erfinv(const DType val, const DTypeGrad gra return 0.5f * op::sqrt(pi) * op::exp(val * val) * grad; } -template -__device__ inline DTypeGrad backward_smooth_l1(const DType val, const DType2 scalar, - const DTypeGrad grad) { +template +__device__ inline DType smooth_l1_grad(const DType val, const DType2 scalar) { auto bsq = scalar * scalar; auto ibsq = 1.0f / bsq; if (val > ibsq) { - return grad; + return 1; } else if (val < -ibsq) { - return -grad; + return -1; } else { - return bsq * val * grad; + return bsq * val; } } diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index c3bc6881de3d..34c33b198000 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -298,13 +298,15 @@ Alignment CheckAlignment(const Params& params, const index_t lead_dim, size_t i = 0; for (const void *ptr : params.inputs) { - int new_align = CalcAlignment(ptr, - mshadow_type_info(inputs[i].type_flag_).size * nvec); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; + if (ptr 
!= nullptr) { + int new_align = CalcAlignment(ptr, + mshadow_type_info(inputs[i].type_flag_).size * nvec); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } } } ++i; @@ -312,13 +314,15 @@ Alignment CheckAlignment(const Params& params, const index_t lead_dim, i = 0; for (const void *ptr : params.outputs) { - int new_align = CalcAlignment(ptr, - mshadow_type_info(outputs[i].type_flag_).size * nvec); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; + if (ptr != nullptr) { + int new_align = CalcAlignment(ptr, + mshadow_type_info(outputs[i].type_flag_).size * nvec); + if (align == -1) { + align = new_align; + } else { + if (align != new_align) { + return Alignment::DIFFERENT; + } } } ++i; diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 066f54c0744e..e4c9ec9d5825 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -159,8 +159,8 @@ const std::map>> ops_desc = { {"_backward_square" , {{"op::backward_square(%, %)", "_1", "_0"}}}, {"_backward_div_scalar" , {{"(% * 1.0f/float(%))", "_0", "scalar"}}}, {"_backward_div_scalar" , {{"(% * 1.0f/float(%))", "_0", "scalar"}}}, - {"_backward_rdiv_scalar" , {{"(-% * float(%) / (% * %))", "_0", - "scalar", "_1", "_1"}}}, + {"_backward_rdiv_scalar" , {{"(op::rdiv_grad(%, %) * %", "_1", + "scalar", "_0"}}}, {"_backward_hypot_scalar" , {{"(% * % / op::hypot(%, float(%)))", "_0", "_1", "_1", "scalar"}}}, {"_backward_radians" , {{"op::radians(%)", "_0"}}}, @@ -173,7 +173,7 @@ const std::map>> ops_desc = { {"_backward_clip" , {{"op::backward_clip(%, %, %, %)", "_1", "_0", "a_min", "a_max"}}}, {"smooth_l1" , {{"op::smooth_l1(%, float(%))", "_0", "scalar"}}}, - {"_backward_smooth_l1" , {{"op::backward_smooth_l1(%, float(%), %)", + {"_backward_smooth_l1" , {{"op::smooth_l1_grad(%, float(%)) * %", "_1", "scalar", "_0"}}}, // TODO(ptredak): arange // TODO(ptredak): LeakyRelu diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index cb94b69c6bae..f4266ff731df 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -760,8 +760,6 @@ MXNET_BINARY_MATH_OP(copysign_rgrad, 0); MXNET_BINARY_MATH_OP(rcopysign, (b >= 0 && a >= 0) || (b < 0 && a < 0) ? 
b : -b); -MXNET_BINARY_MATH_OP(rcopysign_grad, 0); - struct mod : public mxnet_op::tunable { template MSHADOW_XINLINE static typename enable_if::value, DType>::type diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cc b/src/operator/numpy/np_elemwise_broadcast_op_extended.cc index ce7f59a5520f..90a48d4aee9f 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cc +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cc @@ -201,16 +201,12 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_copysign_scalar) MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rcopysign_scalar) .set_attr("FCompute", BinaryScalarOp::Compute) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_npi_rcopysign_scalar"}); +.set_attr("FGradient", MakeZeroGradNodes); MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_backward_npi_copysign_scalar) .set_attr("FCompute", BinaryScalarOp::Backward); -MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_backward_npi_rcopysign_scalar) -.set_attr("FCompute", - BinaryScalarOp::Backward); - inline bool Arctan2OpType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, std::vector* out_attrs) { diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu index 9fdbac6b65fa..61919da85aa7 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu @@ -67,23 +67,19 @@ NNVM_REGISTER_OP(_npi_rcopysign_scalar) NNVM_REGISTER_OP(_backward_npi_copysign_scalar) .set_attr("FCompute", - BinaryScalarOp::Backward); - -NNVM_REGISTER_OP(_backward_npi_rcopysign_scalar) -.set_attr("FCompute", - BinaryScalarOp::Backward); + BinaryScalarRTCBackward{"copysign_grad"}); NNVM_REGISTER_OP(_npi_arctan2_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"arctan2"}); NNVM_REGISTER_OP(_backward_npi_arctan2_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"arctan2_grad"}); NNVM_REGISTER_OP(_npi_rarctan2_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"rarctan2"}); NNVM_REGISTER_OP(_backward_npi_rarctan2_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"rarctan2_grad"}); NNVM_REGISTER_OP(_npi_lcm_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"lcm"}); @@ -111,10 +107,10 @@ NNVM_REGISTER_OP(_backward_npi_ldexp) mshadow_op::ldexp_rgrad>); NNVM_REGISTER_OP(_backward_npi_ldexp_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"ldexp_grad"}); NNVM_REGISTER_OP(_backward_npi_rldexp_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"rldexp_grad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index b5e253a1872e..61e8959ac777 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -368,7 +368,6 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::copysign); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rcopysign); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::copysign_grad); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::copysign_rgrad); // NOLINT() -IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rcopysign_grad); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arctan2); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rarctan2); // NOLINT() 
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arctan2_grad); // NOLINT() diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh index ac18443bb550..4ab4caa144e3 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -28,7 +28,6 @@ #include #include "../operator_common.h" -#include "../../common/cuda/vectorization.cuh" #include "elemwise_unary_op.h" #include @@ -38,71 +37,8 @@ namespace mxnet { namespace op { -namespace binary_scalar { - -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -struct VectorizedKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; - DType scalar; -}; - -template -__global__ void VectorizedBinaryScalarKernelBwd(const VectorizedKernelParams params, - const index_t N) { - VectorizedLoader ograd_loader(params.inputs[0], N); - VectorizedLoader input_loader(params.inputs[1], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = ograd_loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - ograd_loader.load(tid, N); - input_loader.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < ograd_loader.nvec(); ++i) { - DType ograd = ograd_loader.separate()[i]; - DType temp = ograd * OP::Map(input_loader.separate()[i], - params.scalar); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - -template -class VectorizedBinaryScalarBwd { - public: - using ParamType = VectorizedKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryScalarKernelBwd - <<>>(params, lead_dim); - } -}; - -} // namespace binary_scalar - struct binary_scalar_kernel_params { - const void *inputs[1]; + const void *inputs[2]; void *outputs[1]; double scalar; }; @@ -110,7 +46,7 @@ struct binary_scalar_kernel_params { const char binary_scalar_kernel_fwd[] = R"code( struct binary_scalar_kernel_params { - const void *inputs[1]; + const void *inputs[2]; void *outputs[1]; double scalar; }; @@ -185,7 +121,7 @@ struct BinaryScalarRTCCompute { const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; const index_t size = outputs[0].Size(); - binary_scalar_kernel_params params = { {inputs[0].dptr_}, + binary_scalar_kernel_params params = { {inputs[0].dptr_, nullptr}, {outputs[0].dptr_}, alpha }; @@ -228,35 +164,101 @@ struct BinaryScalarRTCCompute { } }; -template -void BinaryScalarOp::Backward_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary_scalar; - if (req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); - const double alpha = param.scalar; - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedBinaryScalarBwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.inputs[1] = inputs[1].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.scalar = (DType)alpha; - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); +const char binary_scalar_kernel_bwd[] = R"code( + +struct binary_scalar_kernel_params { + const void *inputs[2]; + void *outputs[1]; + double scalar; +}; + +__global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader ograd_loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedLoader input_loader( + reinterpret_cast(params.inputs[1]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using GType = AccType; + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + input_loader.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto ograd = GType::from(ograd_loader.separate()[i]); + const auto input = IType::from(input_loader.separate()[i]); + // enables returning different type + const auto temp = op::mul(grad, + OP(input, static_cast(params.scalar))); + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } } +)code"; + +struct BinaryScalarRTCBackward { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + binary_scalar_kernel_bwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; + + const index_t size = outputs[0].Size(); + binary_scalar_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, + {outputs[0].dptr_}, + alpha }; + + VectorizedKernelRTCLauncher(code, "binary_scalar_kernel_bwd", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); + } +}; + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index be9adee35fb9..5e95df7aa9db 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -432,15 +432,6 @@ class BinaryScalarOp : public UnaryOp { }); } -#if MXNET_USE_CUDA - template - static void Backward_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - template static void Backward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index 1425702459db..9635e83b4453 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -57,19 +57,19 @@ NNVM_REGISTER_OP(_rdiv_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"rdiv"}); NNVM_REGISTER_OP(_backward_rdiv_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"rdiv_grad"}); NNVM_REGISTER_OP(_mod_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"mod_grad"}); NNVM_REGISTER_OP(_rmod_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"rmod"}); NNVM_REGISTER_OP(_backward_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"rmod_grad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index 9bb047569bf7..84f188d3d44f 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -32,37 +32,37 @@ NNVM_REGISTER_OP(_maximum_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"max"}); NNVM_REGISTER_OP(_backward_maximum_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"greater_equal"}); NNVM_REGISTER_OP(_minimum_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_minimum_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"lesser_equal"}); NNVM_REGISTER_OP(_power_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"power_grad"}); NNVM_REGISTER_OP(_rpower_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"rpow"}); NNVM_REGISTER_OP(_backward_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"rpower_grad"}); NNVM_REGISTER_OP(_hypot_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_hypot_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", 
BinaryScalarRTCBackward{"hypot_grad_left"}); NNVM_REGISTER_OP(smooth_l1) .set_attr("FCompute", BinaryScalarRTCCompute{"smooth_l1"}); NNVM_REGISTER_OP(_backward_smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"smooth_l1_grad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh index e987cc06ebdf..68dbbbf5d64f 100644 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ b/src/operator/tensor/elemwise_unary_op.cuh @@ -28,7 +28,6 @@ #include #include "../operator_common.h" -#include "../../common/cuda/vectorization.cuh" #include "../../common/cuda/rtc/vectorization-inl.h" #include "../../common/cuda/rtc.h" From fa732a52bbaeccfff77e91a4b3b031b8725753f0 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 1 Jun 2020 14:21:48 -0700 Subject: [PATCH 07/64] Binary forward --- src/operator/tensor/elemwise_binary_op.cuh | 95 +++++++++++++++++++ .../tensor/elemwise_binary_op_basic.cu | 12 +-- .../tensor/elemwise_binary_op_extended.cu | 9 +- .../tensor/elemwise_binary_op_logic.cu | 19 ++-- 4 files changed, 114 insertions(+), 21 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh index f2e922e079ac..e9fcd2a09798 100644 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -29,6 +29,7 @@ #include #include "../operator_common.h" #include "../../common/cuda/vectorization.cuh" +#include "../../common/cuda/rtc.h" #include @@ -256,6 +257,100 @@ void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, }); } +struct binary_kernel_params { + const void *inputs[2]; + void *outputs[1]; +}; + +const char binary_kernel_fwd[] = R"code( + +struct binary_kernel_params { + const void *inputs[2]; + void *outputs[1]; +}; + +__global__ void binary_kernel(const binary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader0( + reinterpret_cast(params.inputs[0]), N); + VectorizedLoader loader1( + reinterpret_cast(params.inputs[1]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType0 = AccType; + using IType1 = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader0.load(tid, N); + loader1.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input0 = IType0::from(loader0.separate()[i]); + const auto input1 = IType1::from(loader1.separate()[i]); + const auto temp = OP(input0, input1); // enables returning different type + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } +} + +)code"; + +struct ElemwiseBinaryRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + 
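+    // A sketch of what happens below, under the assumption that OP names one of
+    // the binary functors defined in rtc/forward_functions-inl.h (e.g. "add"):
+    // the request type and the operator are prepended to binary_kernel_fwd, so
+    // for elemwise_add with req[0] == kWriteTo the string handed to NVRTC
+    // starts roughly with
+    //
+    //   const OpReqType req = OpReqType::kWriteTo;
+    //   #define OP op::add
+    //
+    // followed by the binary_kernel_fwd string above. VectorizedKernelRTCLauncher
+    // then picks the vectorization width from nvec and the alignment of the
+    // input/output pointers, compiles the kernel for this device (reusing a
+    // cached build when available), and launches it over `size` elements.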
CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + binary_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + + const index_t size = outputs[0].Size(); + binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, + {outputs[0].dptr_} }; + + VectorizedKernelRTCLauncher(code, "binary_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); + } +}; + template void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, mshadow::Stream* s, diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index b21b08d03217..2c49865881ff 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -218,11 +218,11 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"add"}) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"add"}); NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", @@ -230,7 +230,7 @@ NNVM_REGISTER_OP(_backward_add) mshadow_op::identity>); NNVM_REGISTER_OP(elemwise_sub) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"sub"}) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sub) @@ -239,7 +239,7 @@ NNVM_REGISTER_OP(_backward_sub) mshadow_op::negation>); NNVM_REGISTER_OP(elemwise_mul) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"mul"}) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeDnsLRValueEx); @@ -250,7 +250,7 @@ NNVM_REGISTER_OP(_backward_mul) NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", - ElemwiseBinaryOp::Compute); + ElemwiseBinaryRTCCompute{"div"}); NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", @@ -258,7 +258,7 @@ NNVM_REGISTER_OP(_backward_div) mshadow_op::div_rgrad>); NNVM_REGISTER_OP(_mod) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu index 0ae6ac966a2b..2bf608d53660 100644 --- a/src/operator/tensor/elemwise_binary_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_op_extended.cu @@ -22,34 +22,33 @@ * \file elemwise_binary_op_extended.cu * \brief GPU Implementation of binary function. 
*/ -#include "./elemwise_unary_op.h" #include "./elemwise_binary_op.h" namespace mxnet { namespace op { NNVM_REGISTER_OP(_power) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_power) .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(_maximum) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"max"}); NNVM_REGISTER_OP(_backward_maximum) .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(_minimum) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_minimum) .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(_hypot) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_hypot) .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"equal"}); NNVM_REGISTER_OP(_not_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"not_equal"}); NNVM_REGISTER_OP(_greater) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"greater"}); NNVM_REGISTER_OP(_greater_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"greater_equal"}); NNVM_REGISTER_OP(_lesser) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"less"}); NNVM_REGISTER_OP(_lesser_equal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"less_equal"}); NNVM_REGISTER_OP(_logical_and) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"logical_and"}); NNVM_REGISTER_OP(_logical_or) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"logical_or"}); NNVM_REGISTER_OP(_logical_xor) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"logical_xor"}); } // namespace op } // namespace mxnet From a2933f839840ed9fe5662ee3057115e8641bf728 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 1 Jun 2020 16:30:31 -0700 Subject: [PATCH 08/64] Fix for binary_scalar --- src/operator/tensor/elemwise_binary_scalar_op.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh index 4ab4caa144e3..886909693795 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ b/src/operator/tensor/elemwise_binary_scalar_op.cuh @@ -204,7 +204,7 @@ __global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params param const auto ograd = GType::from(ograd_loader.separate()[i]); const auto input = IType::from(input_loader.separate()[i]); // enables returning different type - const auto temp = op::mul(grad, + const auto temp = op::mul(ograd, OP(input, static_cast(params.scalar))); if (req == OpReqType::kAddTo) { From 479c0f82d5c5743026e873e5efffc90418f1e3be Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 2 Jun 2020 10:32:08 -0700 Subject: [PATCH 09/64] Moving all binary forward to RTC Reorganization --- src/common/cuda/rtc.cc | 3 + src/common/cuda/rtc/backward_functions-inl.h | 100 +++++-- 
src/common/cuda/rtc/special_functions-inl.h | 282 ++++++++++++++++++ src/operator/fusion/fused_op-inl.h | 62 ++-- src/operator/fusion/fused_op.cu | 10 +- src/operator/nn/group_norm-inl.h | 18 +- src/operator/nn/layer_norm-inl.h | 18 +- .../numpy/np_elemwise_unary_op_basic.cc | 2 +- src/operator/operator_tune.cc | 1 - src/operator/tensor/elemwise_binary_op.cc | 99 ++++++ src/operator/tensor/elemwise_binary_op.cuh | 168 ----------- src/operator/tensor/elemwise_binary_op.h | 51 ++-- .../tensor/elemwise_binary_op_basic.cu | 1 + .../tensor/elemwise_binary_scalar_op.cc | 249 ++++++++++++++++ .../tensor/elemwise_binary_scalar_op.cuh | 266 ----------------- .../tensor/elemwise_binary_scalar_op.h | 33 +- .../elemwise_binary_scalar_op_extended.cu | 2 +- src/operator/tensor/elemwise_unary_op.cc | 152 ++++++++++ src/operator/tensor/elemwise_unary_op.cuh | 167 ----------- src/operator/tensor/elemwise_unary_op.h | 25 +- .../tensor/elemwise_unary_op_basic.cc | 4 +- .../tensor/elemwise_unary_op_basic.cu | 25 +- .../tensor/elemwise_unary_op_logexp.cu | 15 +- src/operator/tensor/elemwise_unary_op_pow.cu | 16 +- src/operator/tensor/elemwise_unary_op_trig.cu | 42 +-- 25 files changed, 1027 insertions(+), 784 deletions(-) create mode 100644 src/common/cuda/rtc/special_functions-inl.h create mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cc delete mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cuh create mode 100644 src/operator/tensor/elemwise_unary_op.cc delete mode 100644 src/operator/tensor/elemwise_unary_op.cuh diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 5900b31489da..24f21d59c794 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -35,6 +35,7 @@ #include "rtc/forward_functions-inl.h" #include "rtc/backward_functions-inl.h" #include "rtc/vectorization-inl.h" +#include "rtc/special_functions-inl.h" #include "utils.h" @@ -114,6 +115,8 @@ CUfunction get_function(const std::string &code, std::string(fp16_support_string) + "\n" + type_support_string + "\n" + op_req_type_string + "\n" + + float_limits() + + special_functions_definitions + '\n' + function_definitions + "\n" + backward_function_definitions + "\n" + vectorization_support_string + "\n"; diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index 7e6da8a214d7..44d39eeb17ba 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -32,142 +32,158 @@ const char backward_function_definitions[] = R"code( namespace op { template -__device__ inline DTypeGrad backward_relu(const DType val, const DTypeGrad grad) { - return (isnan(val) || val > 0) ? grad : 0; +__device__ inline DTypeGrad backward_relu(const DTypeGrad grad, const DType val) { + if (isnan(val)) return val; + return val > 0 ? 
grad : 0; } template -__device__ inline DTypeGrad backward_sigmoid(const DType out, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_sigmoid(const DTypeGrad grad, const DType out) { return grad * out * (1 - out); } template -__device__ inline DTypeGrad backward_softrelu(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_softrelu(const DTypeGrad grad, const DType val) { return grad * sigmoid(val); } template -__device__ inline DTypeGrad backward_softsign(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_softsign(const DTypeGrad grad, const DType val) { const DType ap1 = 1 + op::abs(val); return grad / (ap1 * ap1); } template -__device__ inline DTypeGrad backward_exp(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_abs(const DTypeGrad grad, const DType val) { + return grad * op::sign(val); +} + +template +__device__ inline DTypeGrad backward_exp(const DTypeGrad grad, const DType val) { return grad * op::exp(val); } template -__device__ inline DTypeGrad backward_expm1(const DType val, const DTypeGrad grad) { - return backward_exp(val, grad); +__device__ inline DTypeGrad backward_expm1(const DTypeGrad grad, const DType val) { + return backward_exp(grad, val); } template -__device__ inline DTypeGrad backward_log(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_log(const DTypeGrad grad, const DType val) { return grad / val; } template -__device__ inline DTypeGrad backward_log10(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_log10(const DTypeGrad grad, const DType val) { return grad / (val * op::log(static_cast(10))); } template -__device__ inline DTypeGrad backward_log2(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_log2(const DTypeGrad grad, const DType val) { return grad / (val * op::log(static_cast(2))); } template -__device__ inline DTypeGrad backward_log1p(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_log1p(const DTypeGrad grad, const DType val) { return grad / (1 + val); } template -__device__ inline DTypeGrad backward_sin(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_sin(const DTypeGrad grad, const DType val) { return grad * op::cos(val); } template -__device__ inline DTypeGrad backward_cos(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_cos(const DTypeGrad grad, const DType val) { return -grad * op::sin(val); } // Uses output from tan template -__device__ inline DTypeGrad backward_tan(const DType out, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_tan(const DTypeGrad grad, const DType out) { return grad * (out * out + 1); } template -__device__ inline DTypeGrad backward_arcsin(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_arcsin(const DTypeGrad grad, const DType val) { return grad / op::sqrt(1 - val*val); } template -__device__ inline DTypeGrad backward_arccos(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_arccos(const DTypeGrad grad, const DType val) { return -grad / op::sqrt(1 - val*val); } template -__device__ inline DTypeGrad backward_arctan(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_arctan(const DTypeGrad grad, const DType val) { return grad / (1 + val*val); } template -__device__ inline DTypeGrad backward_sinh(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad 
backward_degrees(const DTypeGrad grad, const DType /* val */) { + return op::degrees(grad); +} + +template +__device__ inline DTypeGrad backward_radians(const DTypeGrad grad, const DType /* val */) { + return op::radians(grad); +} + +template +__device__ inline DTypeGrad backward_sinh(const DTypeGrad grad, const DType val) { return grad * op::cosh(val); } template -__device__ inline DTypeGrad backward_cosh(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_cosh(const DTypeGrad grad, const DType val) { return grad * op::sinh(val); } // Uses tanh output template -__device__ inline DTypeGrad backward_tanh(const DType out, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_tanh(const DTypeGrad grad, const DType out) { return grad * (1 - out * out); } template -__device__ inline DTypeGrad backward_arcsinh(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_arcsinh(const DTypeGrad grad, const DType val) { return grad / op::sqrt(val * val + 1); } template -__device__ inline DTypeGrad backward_arccosh(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_arccosh(const DTypeGrad grad, const DType val) { return grad / op::sqrt(val * val - 1); } template -__device__ inline DTypeGrad backward_arctanh(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_arctanh(const DTypeGrad grad, const DType val) { return grad / (1 - val * val); } template -__device__ inline DTypeGrad backward_sqrt(const DType out, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_sqrt(const DTypeGrad grad, const DType out) { return 0.5 * grad / out; } template -__device__ inline DTypeGrad backward_rsqrt(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_rsqrt(const DTypeGrad grad, const DType val) { const DType inv = 1 / val; return -0.5 * grad * op::sqrt(inv) * inv; } template -__device__ inline DTypeGrad backward_cbrt(const DType out, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_cbrt(const DTypeGrad grad, const DType out) { return grad / (3.0f * out * out); } template -__device__ inline DTypeGrad backward_rcbrt(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_rcbrt(const DTypeGrad grad, const DType val) { const DType inv = 1 / val; return -1.f/3.f * grad * op::cbrt(inv) * inv; } template -__device__ inline DTypeGrad backward_square(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_square(const DTypeGrad grad, const DType val) { return 2 * val * grad; } @@ -252,7 +268,7 @@ __device__ inline DType rldexp_grad(const DType val, } template -__device__ inline DTypeGrad backward_clip(const DType val, const DTypeGrad grad, +__device__ inline DTypeGrad backward_clip(const DTypeGrad grad, const DType val, const float a_min, const float a_max) { if (val > a_max || val < a_min) { return 0; @@ -262,20 +278,38 @@ __device__ inline DTypeGrad backward_clip(const DType val, const DTypeGrad grad, } template -__device__ inline DTypeGrad backward_reciprocal(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_reciprocal(const DTypeGrad grad, const DType val) { return -grad / (val * val); } template -__device__ inline DTypeGrad backward_erf(const DType val, const DTypeGrad grad) { +__device__ inline DTypeGrad backward_erf(const DTypeGrad grad, const DType val) { return 2.0f / op::sqrt(pi) * op::exp(-(val*val)) * grad; } template -__device__ inline DTypeGrad backward_erfinv(const DType val, const 
DTypeGrad grad) { +__device__ inline DTypeGrad backward_erfinv(const DTypeGrad grad, const DType val) { return 0.5f * op::sqrt(pi) * op::exp(val * val) * grad; } +template +__device__ inline DTypeGrad backward_gamma(const DTypeGrad grad, const DType val) { + if (type_util::is_same::value) { + return grad * op::gamma(val) * op::special_functions::cephes::psi(val); + } else { + return grad * op::gamma(val) * op::special_functions::cephes::psi(val); + } +} + +template +__device__ inline DTypeGrad backward_gammaln(const DTypeGrad grad, const DType val) { + if (type_util::is_same::value) { + return grad * op::special_functions::cephes::psi(val); + } else { + return grad * op::special_functions::cephes::psi(val); + } +} + template __device__ inline DType smooth_l1_grad(const DType val, const DType2 scalar) { auto bsq = scalar * scalar; diff --git a/src/common/cuda/rtc/special_functions-inl.h b/src/common/cuda/rtc/special_functions-inl.h new file mode 100644 index 000000000000..cec9d7891199 --- /dev/null +++ b/src/common/cuda/rtc/special_functions-inl.h @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_COMMON_CUDA_RTC_SPECIAL_FUNCTIONS_INL_H_ +#define MXNET_COMMON_CUDA_RTC_SPECIAL_FUNCTIONS_INL_H_ + +#include +#include + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const std::string float_limits() { + return std::string("constexpr double DBL_MAX = ") + + std::to_string(DBL_MAX) + + ";\n" + + "constexpr float FLT_MAX = " + + std::to_string(FLT_MAX) + + ";\n"; +} + +// This code is based on the Cephes Library availible at http://www.netlib.org/cephes +// The original author, Stephen Moshier, has kindly given permission to use this code +// in mxnet. (See email below). +// +// Date: Tue, 13 Sep 2016 09:28:20 -0400 +// From: Stephen Moshier +// To: Flunkert, Valentin +// Subject: Re: cephes code in mxnet +// +// Hello Valentin, +// +// Thank you for writing. You are welcome to use and modify the Cephes code +// and distribute it under the Apache license. +// +// Good luck with your project, +// Steve Moshier +// +// Cephes Math Library Release 2.2: June, 1992 +// Copyright 1984, 1987, 1992 by Stephen L. Moshier +// Direct inquiries to 30 Frost Street, Cambridge, MA 02140 +// +const char special_functions_definitions[] = R"code( + +namespace op { + +namespace special_functions { + +template +struct helper_numeric_limits { + __device__ inline static DType max(); +}; + +template<> +struct helper_numeric_limits { + __device__ inline static double max() { + return DBL_MAX; + } +}; + +template<> +struct helper_numeric_limits { + __device__ inline static double max() { + return FLT_MAX; + } +}; + + +struct cephes { + /* + * Helper to evaluate a polynomial given an array of coefficients. 
+ */ + template + __device__ inline static DType polevl(DType x, const DType coef[], int N) { + DType ans; + DType const *p; + int i; + + p = coef; + ans = *p++; + + i = N; + do { + ans = ans * x + *p++; + } while ( --i ); + + return( ans ); + } + + + /* + * Helper function for psi that handles double/float specific differences + * in the algorithm. + */ + template + __device__ inline static DType psi_helper(DType s); + + /* + * + * Psi (digamma) function + * + * + * SYNOPSIS: + * + * float x, y, psif(); + * + * y = psif( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * This formula is used for 0 < n <= 10. If x is negative, it + * is transformed to a positive argument by the reflection + * formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. + * + * ACCURACY: + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 MAXNUMF + */ + template + __device__ inline static DType psi(DType x) { + DType p, q, nz, s, w, y; + int i, n, negative; + + DType EUL(0.57721566490153286061); + DType PI(3.14159265358979323846); + + negative = 0; + nz = 0.0; + + if ( x <= 0.0 ) { + negative = 1; + q = x; + p = ::floor(q); + if ( p == q ) { + return helper_numeric_limits::max(); + } + /* Remove the zeros of tan(PI x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if ( nz != 0.5 ) { + if ( nz > 0.5 ) { + p += 1.0; + nz = q - p; + } + nz = PI/::tan(PI*nz); + } else { + nz = 0.0; + } + x = 1.0 - x; + } + + /* check for positive integer up to 10 */ + if ( (x <= 10.0) && (x == ::floor(x)) ) { + y = 0.0; + n = x; + for ( i = 1; i < n; i++ ) { + w = i; + y += 1.0/w; + } + y -= EUL; + goto done; + } + + s = x; + w = 0.0; + while ( s < 10.0 ) { + w += 1.0/s; + s += 1.0; + } + + y = psi_helper(s); + + y = logf(s) - (0.5/s) - y - w; + +done: + + if ( negative ) { + y -= nz; + } + + return(y); + } +}; + + +template<> +__device__ inline double cephes::psi_helper(double s) { + double z; + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + if ( s < 1.0e17 ) { + z = 1.0/(s * s); + return z * cephes::polevl(z, A, 6); + } else { + return 0.0; + } +} + +template<> +__device__ inline float cephes::psi_helper(float s) { + float z; + const float A[] = { + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f + }; + + if ( s < 1.0e8 ) { + z = 1.0/(s * s); + return z * cephes::polevl(z, A, 3); + } else { + return 0.0; + } +} +} // namespace special_functions +} // namespace op +)code"; + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_COMMON_CUDA_RTC_SPECIAL_FUNCTIONS_INL_H_ diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index 
e4c9ec9d5825..d2e2fae7c482 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -133,51 +133,49 @@ const std::map>> ops_desc = { {"_hypot" , {{"op::hypot(%, %)", "_0", "_1"}}}, {"_hypot_scalar" , {{"op::hypot(%, float(%))", "_0", "scalar"}}}, {"logical_not" , {{"op::logical_not(%)", "_0"}}}, - {"_backward_relu" , {{"op::backward_relu(%, %)", "_1", "_0"}}}, - {"_backward_sigmoid" , {{"op::backward_sigmoid(%, %)", "_1", "_0"}}}, - {"_backward_expm1" , {{"op::backward_expm1(%, %)", "_1", "_0"}}}, - {"_backward_log" , {{"op::backward_log(%, %)", "_1", "_0"}}}, - {"_backward_log10" , {{"op::backward_log10(%, %)", "_1", "_0"}}}, - {"_backward_log2" , {{"op::backward_log2(%, %)", "_1", "_0"}}}, - {"_backward_log1p" , {{"op::backward_log1p(%, %)", "_1", "_0"}}}, - {"_backward_sin" , {{"op::backward_sin(%, %)", "_1", "_0"}}}, - {"_backward_cos" , {{"op::backward_cos(%, %)", "_1", "_0"}}}, - {"_backward_tan" , {{"op::backward_tan(%, %)", "_1", "_0"}}}, - {"_backward_arcsin" , {{"op::backward_arcsin(%, %)", "_1", "_0"}}}, - {"_backward_arccos" , {{"op::backward_arccos(%, %)", "_1", "_0"}}}, - {"_backward_arctan" , {{"op::backward_arctan(%, %)", "_1", "_0"}}}, - {"_backward_sinh" , {{"op::backward_sinh(%, %)", "_1", "_0"}}}, - {"_backward_cosh" , {{"op::backward_cosh(%, %)", "_1", "_0"}}}, - {"_backward_tanh" , {{"op::backward_tanh(%, %)", "_1", "_0"}}}, - {"_backward_arcsinh" , {{"op::backward_arcsinh(%, %)", "_1", "_0"}}}, - {"_backward_arccosh" , {{"op::backward_arccosh(%, %)", "_1", "_0"}}}, - {"_backward_arctanh" , {{"op::backward_arctanh(%, %)", "_1", "_0"}}}, - {"_backward_sqrt" , {{"op::backward_sqrt(%, %)", "_1", "_0"}}}, - {"_backward_rsqrt" , {{"op::backward_rsqrt(%, %)", "_1", "_0"}}}, - {"_backward_cbrt" , {{"op::backward_cbrt(%, %)", "_1", "_0"}}}, - {"_backward_rcbrt" , {{"op::backward_rcbrt(%, %)", "_1", "_0"}}}, - {"_backward_square" , {{"op::backward_square(%, %)", "_1", "_0"}}}, + {"_backward_relu" , {{"op::backward_relu(%, %)", "_0", "_1"}}}, + {"_backward_sigmoid" , {{"op::backward_sigmoid(%, %)", "_0", "_1"}}}, + {"_backward_expm1" , {{"op::backward_expm1(%, %)", "_0", "_1"}}}, + {"_backward_log" , {{"op::backward_log(%, %)", "_0", "_1"}}}, + {"_backward_log10" , {{"op::backward_log10(%, %)", "_0", "_1"}}}, + {"_backward_log2" , {{"op::backward_log2(%, %)", "_0", "_1"}}}, + {"_backward_log1p" , {{"op::backward_log1p(%, %)", "_0", "_1"}}}, + {"_backward_sin" , {{"op::backward_sin(%, %)", "_0", "_1"}}}, + {"_backward_cos" , {{"op::backward_cos(%, %)", "_0", "_1"}}}, + {"_backward_tan" , {{"op::backward_tan(%, %)", "_0", "_1"}}}, + {"_backward_arcsin" , {{"op::backward_arcsin(%, %)", "_0", "_1"}}}, + {"_backward_arccos" , {{"op::backward_arccos(%, %)", "_0", "_1"}}}, + {"_backward_arctan" , {{"op::backward_arctan(%, %)", "_0", "_1"}}}, + {"_backward_sinh" , {{"op::backward_sinh(%, %)", "_0", "_1"}}}, + {"_backward_cosh" , {{"op::backward_cosh(%, %)", "_0", "_1"}}}, + {"_backward_tanh" , {{"op::backward_tanh(%, %)", "_0", "_1"}}}, + {"_backward_arcsinh" , {{"op::backward_arcsinh(%, %)", "_0", "_1"}}}, + {"_backward_arccosh" , {{"op::backward_arccosh(%, %)", "_0", "_1"}}}, + {"_backward_arctanh" , {{"op::backward_arctanh(%, %)", "_0", "_1"}}}, + {"_backward_sqrt" , {{"op::backward_sqrt(%, %)", "_0", "_1"}}}, + {"_backward_rsqrt" , {{"op::backward_rsqrt(%, %)", "_0", "_1"}}}, + {"_backward_cbrt" , {{"op::backward_cbrt(%, %)", "_0", "_1"}}}, + {"_backward_rcbrt" , {{"op::backward_rcbrt(%, %)", "_0", "_1"}}}, + {"_backward_square" , 
{{"op::backward_square(%, %)", "_0", "_1"}}}, {"_backward_div_scalar" , {{"(% * 1.0f/float(%))", "_0", "scalar"}}}, {"_backward_div_scalar" , {{"(% * 1.0f/float(%))", "_0", "scalar"}}}, - {"_backward_rdiv_scalar" , {{"(op::rdiv_grad(%, %) * %", "_1", + {"_backward_rdiv_scalar" , {{"op::rdiv_grad(%, %) * %", "_1", "scalar", "_0"}}}, {"_backward_hypot_scalar" , {{"(% * % / op::hypot(%, float(%)))", "_0", "_1", "_1", "scalar"}}}, {"_backward_radians" , {{"op::radians(%)", "_0"}}}, - {"_backward_erf" , {{"op::backward_erf(%, %)", "_1", "_0"}}}, - {"_backward_erfinv" , {{"op::backward_erfinv(%, %)", "_1", "_0"}}}, - {"_backward_reciprocal" , {{"op::backward_reciprocal(%, %)", "_1", "_0"}}}, - {"_backward_abs" , {{"(% * op::sign(%))", "_0", "_1"}}}, + {"_backward_erf" , {{"op::backward_erf(%, %)", "_0", "_1"}}}, + {"_backward_erfinv" , {{"op::backward_erfinv(%, %)", "_0", "_1"}}}, + {"_backward_reciprocal" , {{"op::backward_reciprocal(%, %)", "_0", "_1"}}}, + {"_backward_abs" , {{"(op::backward_abs(%, %))", "_0", "_1"}}}, {"_backward_degrees" , {{"op::degrees(%)", "_0"}}}, - {"_backward_sign" , {{"op::zero(%)", "_0"}}}, - {"_backward_clip" , {{"op::backward_clip(%, %, %, %)", "_1", "_0", + {"_backward_clip" , {{"op::backward_clip(%, %, %, %)", "_0", "_1", "a_min", "a_max"}}}, {"smooth_l1" , {{"op::smooth_l1(%, float(%))", "_0", "scalar"}}}, {"_backward_smooth_l1" , {{"op::smooth_l1_grad(%, float(%)) * %", "_1", "scalar", "_0"}}}, // TODO(ptredak): arange // TODO(ptredak): LeakyRelu - // TODO(ptredak): mod and rmod {"_backward_sub" , {{"(%)", "_0"}, {"(-(%))", "_0"}}}, {"_backward_mul" , {{"(% * %)", "_0", "_2"}, diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 23d6b231630f..6b892ce68349 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -394,17 +394,17 @@ std::string FusedOp::GenerateCode(const std::vector &req, if (op_name == "_backward_Activation") { CHECK_EQ(outputs[i], 1); std::string act_type = node.source->attrs.dict.at("act_type"); - std::string rhs, lhs; - rhs = variables[{node.inputs[0].node_id, node.inputs[0].index}]; + std::string ograd, input; + ograd = variables[{node.inputs[0].node_id, node.inputs[0].index}]; if (act_type == "relu" || act_type == "sigmoid" || act_type == "tanh") { - lhs = variables[{node.inputs[1].node_id, node.inputs[1].index}]; + input = variables[{node.inputs[1].node_id, node.inputs[1].index}]; } else { - lhs = variables[{node.inputs[2].node_id, node.inputs[2].index}]; + input = variables[{node.inputs[2].node_id, node.inputs[2].index}]; } code += "const auto " + var_name + " = op::backward_" + act_type + - "(" + lhs + ", " + rhs + ");\n"; + "(" + ograd + ", " + input + ");\n"; variables[{i, 0}] = var_name; continue; diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index 143e2168d113..deca04ee84a9 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -290,8 +290,13 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, }); } // Calculate grad_gamma, it will be sum(ograd * normalized_data, exclude_axis) - ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); + if constexpr (std::is_same::value) { + ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); + } else { + ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); + } if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { 
BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { @@ -326,8 +331,13 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, BinaryBroadcastCompute(attrs, ctx, {ograd_mult, red_out}, {req[0]}, {output_}); - ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); + if constexpr (std::is_same::value) { + ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); + } else { + ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); + } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { broadcast::Reduce( diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index 238a71bbba1f..085324b3b871 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -276,8 +276,13 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, }); } // Calculate grad_gamma, it will be sum(ograd * normalized_data, exclude_axis) - ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); + if constexpr (std::is_same::value) { + ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); + } else { + ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); + } if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { @@ -322,8 +327,13 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, BinaryBroadcastCompute(attrs, ctx, {ograd_mult, red_out}, {req[0]}, {outputs[0]}); - ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); + if constexpr (std::is_same::value) { + ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); + } else { + ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); + } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { if (safe_acc) { diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cc b/src/operator/numpy/np_elemwise_unary_op_basic.cc index 74b2b7ecd956..c4ab5468cb25 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cc +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cc @@ -228,7 +228,7 @@ The sign function returns -1 if x < 0, 0 if x==0, 1 if x > 0. 
Example:: sign([-2, 0, 3]) = [-1, 0, 1] )code" ADD_FILELINE) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_sign"}); +.set_attr("FGradient", MakeZeroGradNodes); // rint MXNET_OPERATOR_REGISTER_NUMPY_MIXED_TYPE_UNARY(_npi_rint, "x", mshadow_op::rint) diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index 61e8959ac777..f34e8c51eb17 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -297,7 +297,6 @@ IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::reciprocal_cube_root_grad); IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::abs); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sign); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sign); // NOLINT() -IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sign_grad); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::round); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::floor); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::trunc); // NOLINT() diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index ea2466259494..db9f6e022421 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -25,6 +25,10 @@ #include "./elemwise_binary_op.h" +#if MXNET_USE_CUDA +#include "../../common/cuda/rtc/vectorization-inl.h" +#include "../../common/cuda/rtc.h" +#endif // MXNET_USE_CUDA namespace mxnet { namespace op { @@ -92,5 +96,100 @@ bool ElemwiseBinaryOp::BackwardUseInStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } +#if MXNET_USE_CUDA + +struct binary_kernel_params { + const void *inputs[2]; + void *outputs[1]; +}; + +const char binary_kernel_fwd[] = R"code( + +struct binary_kernel_params { + const void *inputs[2]; + void *outputs[1]; +}; + +__global__ void binary_kernel(const binary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader0( + reinterpret_cast(params.inputs[0]), N); + VectorizedLoader loader1( + reinterpret_cast(params.inputs[1]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType0 = AccType; + using IType1 = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader0.load(tid, N); + loader1.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input0 = IType0::from(loader0.separate()[i]); + const auto input1 = IType1::from(loader1.separate()[i]); + const auto temp = OP(input0, input1); // enables returning different type + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } +} + +)code"; + +void ElemwiseBinaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + + const 
std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + binary_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + + const index_t size = outputs[0].Size(); + binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, + {outputs[0].dptr_} }; + + VectorizedKernelRTCLauncher(code, "binary_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + + +#endif // MXNET_USE_CUDA + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh index e9fcd2a09798..88612f1546a0 100644 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ b/src/operator/tensor/elemwise_binary_op.cuh @@ -29,7 +29,6 @@ #include #include "../operator_common.h" #include "../../common/cuda/vectorization.cuh" -#include "../../common/cuda/rtc.h" #include @@ -50,38 +49,6 @@ struct VectorizedBinaryKernelParams { DType* outputs[NumOutputs]; }; -template -__global__ void VectorizedBinaryKernelFwd(const VectorizedBinaryKernelParams params, - const index_t N) { - VectorizedLoader loader0(params.inputs[0], N); - VectorizedLoader loader1(params.inputs[1], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = loader0.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader0.load(tid, N); - loader1.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader0.nvec(); ++i) { - DType temp = OP::Map(loader0.separate()[i], - loader1.separate()[i]); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - template __global__ void VectorizedBinaryKernelBwdUseNone( @@ -184,21 +151,6 @@ __global__ void VectorizedBinaryKernelBwdUseIn( } } -template -class VectorizedBinaryFwd { - public: - using ParamType = VectorizedBinaryKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryKernelFwd - <<>>(params, lead_dim); - } -}; - template class VectorizedBinaryBwdUseNone { public: @@ -231,126 +183,6 @@ class VectorizedBinaryBwdUseIn { } // namespace binary -template -void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary; - if (req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedBinaryFwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.inputs[1] = inputs[1].dptr(); - params.outputs[0] = outputs[0].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); -} - -struct binary_kernel_params { - const void *inputs[2]; - void *outputs[1]; -}; - -const char binary_kernel_fwd[] = R"code( - -struct binary_kernel_params { - const void *inputs[2]; - void *outputs[1]; -}; - -__global__ void binary_kernel(const binary_kernel_params params, - const index_t lead_dim, - const index_t other_dim, - const index_t N, - const index_t 
num_aligned_elements) { - using namespace vector; - VectorizedLoader loader0( - reinterpret_cast(params.inputs[0]), N); - VectorizedLoader loader1( - reinterpret_cast(params.inputs[1]), N); - VectorizedStorer storer( - reinterpret_cast(params.outputs[0]), N); - - using IType0 = AccType; - using IType1 = AccType; - using OType = AccType; - - const index_t M = num_aligned_elements; - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader0.load(tid, N); - loader1.load(tid, N); - if (req == OpReqType::kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < nvec; ++i) { - const auto input0 = IType0::from(loader0.separate()[i]); - const auto input1 = IType1::from(loader1.separate()[i]); - const auto temp = OP(input0, input1); // enables returning different type - - if (req == OpReqType::kAddTo) { - // temp2 may have a wider type than either temp - // or OType - const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); - storer.separate()[i] = OType::to(temp2); - } else { - storer.separate()[i] = OType::to(temp); - } - } - storer.store(tid, N); - } -} - -)code"; - -struct ElemwiseBinaryRTCCompute { - std::string OP; - - void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet::common::cuda::rtc; - if (req[0] == kNullOp) return; - mshadow::Stream* s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - - const std::string code = std::string("const OpReqType req = ") + - util::to_string(req[0]) + - ";\n" + - "#define OP op::" + - OP + - "\n" + - binary_kernel_fwd; - const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; - - const index_t size = outputs[0].Size(); - binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, - {outputs[0].dptr_} }; - - VectorizedKernelRTCLauncher(code, "binary_kernel", nvec, - size, 1, s, params, - inputs, outputs, - ctx.run_ctx.get_ctx().dev_id); - } -}; - template void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, mshadow::Stream* s, diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 158c46122d31..8c2a2cf92c08 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -38,6 +38,7 @@ #include "elemwise_unary_op.h" #include "../../common/utils.h" #include "./init_op.h" +#include "../operator_common.h" namespace mxnet { namespace op { @@ -494,13 +495,15 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static void Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mxnet_op; + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (outputs[0].type_flag_ == mshadow::kBool) { @@ -511,7 +514,7 @@ class ElemwiseBinaryOp : public OpBase { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; if (size != 0) { - Kernel, cpu>::Launch(s, size, + Kernel, xpu>::Launch(s, size, outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr()); } @@ -577,26 +580,6 @@ template }); } -#if MXNET_USE_CUDA - template - static void Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if (req[0] == kNullOp) return; - mshadow::Stream *s = ctx.get_stream(); - Compute_(attrs, s, inputs, req, outputs); - } - template static void ComputeWithBool(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -882,6 +865,20 @@ template [](const NodeAttrs& attrs) { \ return std::vector{ResourceRequest::kTempSpace};}) +#if MXNET_USE_CUDA + +struct ElemwiseBinaryRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +#endif + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 2c49865881ff..927a833a5ac8 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -26,6 +26,7 @@ #include "./elemwise_binary_op.h" #include "./elemwise_binary_op-inl.h" #include "./indexing_op.h" +#include "../../common/cuda/rtc.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc new file mode 100644 index 000000000000..82a2775c13e4 --- /dev/null +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "elemwise_binary_scalar_op.h" + +#if MXNET_USE_CUDA +#include "../../common/cuda/rtc/vectorization-inl.h" +#include "../../common/cuda/rtc.h" +#endif // MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +#if MXNET_USE_CUDA + +struct binary_scalar_kernel_params { + const void *inputs[2]; + void *outputs[1]; + double scalar; +}; + +const char binary_scalar_kernel_fwd[] = R"code( + +struct binary_scalar_kernel_params { + const void *inputs[2]; + void *outputs[1]; + double scalar; +}; + +__global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input = IType::from(loader.separate()[i]); + // enables returning different type + const auto temp = OP(input, static_cast(params.scalar)); + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } +} + +)code"; + +void BinaryScalarRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + binary_scalar_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; + + const index_t size = outputs[0].Size(); + binary_scalar_kernel_params params = { {inputs[0].dptr_, nullptr}, + {outputs[0].dptr_}, + alpha }; + + VectorizedKernelRTCLauncher(code, "binary_scalar_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + +void BinaryScalarRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + InitStorageGeometry<1, 1>(attrs, inputs, outputs); + CHECK_NE(outputs[0].storage_type(), kDefaultStorage) + << "This function works only for sparse types."; + CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()) + << "The storage type of both inputs and outputs needs to be the same."; + AllocateGeometry(&outputs[0], req[0], &inputs[0]); + CopyGeometryBlobs(ctx.get_stream(), &outputs[0], req[0], inputs[0]); + outputs[0].CheckAndAllocData(inputs[0].storage_shape()); + if (inputs[0].storage_shape().Size()) { + std::vector in_blobs, out_blobs; + in_blobs.reserve(inputs.size()); + out_blobs.reserve(outputs.size()); + for (auto &input : inputs) { + in_blobs.emplace_back(input.data()); + } + for (auto &output : outputs) { + out_blobs.emplace_back(output.data()); + } + this->operator()(attrs, ctx, in_blobs, req, out_blobs); + } +} + +const char binary_scalar_kernel_bwd[] = R"code( + +struct binary_scalar_kernel_params { + const void *inputs[2]; + void *outputs[1]; + double scalar; +}; + +__global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader ograd_loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedLoader input_loader( + reinterpret_cast(params.inputs[1]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using GType = AccType; + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + input_loader.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto ograd = GType::from(ograd_loader.separate()[i]); + const auto input = IType::from(input_loader.separate()[i]); + // enables returning different type + const auto temp = op::mul(ograd, + OP(input, static_cast(params.scalar))); + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } +} + +)code"; + +void BinaryScalarRTCBackward::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); + const double alpha = param.scalar; + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + 
";\n" + + "#define OP op::" + + OP + + "\n" + + binary_scalar_kernel_bwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + + const index_t size = outputs[0].Size(); + binary_scalar_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, + {outputs[0].dptr_}, + alpha }; + + VectorizedKernelRTCLauncher(code, "binary_scalar_kernel_bwd", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + +#endif // MXNET_USE_CUDA + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh deleted file mode 100644 index 886909693795..000000000000 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2020 by Contributors - * \file elemwise_binary_scalar_op.cuh - * \brief GPU helpers for binary elementwise operators with scalar - */ - -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ - -#include -#include "../operator_common.h" -#include "elemwise_unary_op.h" - -#include - -#if MXNET_USE_CUDA - -namespace mxnet { -namespace op { - -struct binary_scalar_kernel_params { - const void *inputs[2]; - void *outputs[1]; - double scalar; -}; - -const char binary_scalar_kernel_fwd[] = R"code( - -struct binary_scalar_kernel_params { - const void *inputs[2]; - void *outputs[1]; - double scalar; -}; - -__global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, - const index_t lead_dim, - const index_t other_dim, - const index_t N, - const index_t num_aligned_elements) { - using namespace vector; - VectorizedLoader loader( - reinterpret_cast(params.inputs[0]), N); - VectorizedStorer storer( - reinterpret_cast(params.outputs[0]), N); - - using IType = AccType; - using OType = AccType; - - const index_t M = num_aligned_elements; - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader.load(tid, N); - if (req == OpReqType::kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < nvec; ++i) { - const auto input = IType::from(loader.separate()[i]); - // enables returning different type - const auto temp = OP(input, static_cast(params.scalar)); - - if (req == OpReqType::kAddTo) { - // temp2 may have a wider type than either temp - // or OType - const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); - storer.separate()[i] = OType::to(temp2); - } else { - storer.separate()[i] = OType::to(temp); - } - } - storer.store(tid, N); - } -} - -)code"; - -struct BinaryScalarRTCCompute { - std::string OP; - - void operator()(const 
nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet::common::cuda::rtc; - if (req[0] == kNullOp) return; - mshadow::Stream* s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); - const double alpha = param.scalar; - - const std::string code = std::string("const OpReqType req = ") + - util::to_string(req[0]) + - ";\n" + - "#define OP op::" + - OP + - "\n" + - binary_scalar_kernel_fwd; - const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; - - const index_t size = outputs[0].Size(); - binary_scalar_kernel_params params = { {inputs[0].dptr_, nullptr}, - {outputs[0].dptr_}, - alpha }; - - VectorizedKernelRTCLauncher(code, "binary_scalar_kernel", nvec, - size, 1, s, params, - inputs, outputs, - ctx.run_ctx.get_ctx().dev_id); - } - - void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (req[0] == kNullOp) { - return; - } - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - InitStorageGeometry<1, 1>(attrs, inputs, outputs); - CHECK_NE(outputs[0].storage_type(), kDefaultStorage) - << "This function works only for sparse types."; - CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()) - << "The storage type of both inputs and outputs needs to be the same."; - AllocateGeometry(&outputs[0], req[0], &inputs[0]); - CopyGeometryBlobs(ctx.get_stream(), &outputs[0], req[0], inputs[0]); - outputs[0].CheckAndAllocData(inputs[0].storage_shape()); - if (inputs[0].storage_shape().Size()) { - std::vector in_blobs, out_blobs; - in_blobs.reserve(inputs.size()); - out_blobs.reserve(outputs.size()); - for (auto &input : inputs) { - in_blobs.emplace_back(input.data()); - } - for (auto &output : outputs) { - out_blobs.emplace_back(output.data()); - } - this->operator()(attrs, ctx, in_blobs, req, out_blobs); - } - } -}; - -const char binary_scalar_kernel_bwd[] = R"code( - -struct binary_scalar_kernel_params { - const void *inputs[2]; - void *outputs[1]; - double scalar; -}; - -__global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params params, - const index_t lead_dim, - const index_t other_dim, - const index_t N, - const index_t num_aligned_elements) { - using namespace vector; - VectorizedLoader ograd_loader( - reinterpret_cast(params.inputs[0]), N); - VectorizedLoader input_loader( - reinterpret_cast(params.inputs[1]), N); - VectorizedStorer storer( - reinterpret_cast(params.outputs[0]), N); - - using GType = AccType; - using IType = AccType; - using OType = AccType; - - const index_t M = num_aligned_elements; - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - ograd_loader.load(tid, N); - input_loader.load(tid, N); - if (req == OpReqType::kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < nvec; ++i) { - const auto ograd = GType::from(ograd_loader.separate()[i]); - const auto input = IType::from(input_loader.separate()[i]); - // enables returning different type - const auto temp = op::mul(ograd, - OP(input, static_cast(params.scalar))); - - if (req == OpReqType::kAddTo) { - // temp2 may have a wider type than either temp - // or OType - const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); - storer.separate()[i] = OType::to(temp2); - } else { - 
storer.separate()[i] = OType::to(temp); - } - } - storer.store(tid, N); - } -} - -)code"; - -struct BinaryScalarRTCBackward { - std::string OP; - - void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet::common::cuda::rtc; - if (req[0] == kNullOp) return; - mshadow::Stream* s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - const NumpyBinaryScalarParam& param = nnvm::get(attrs.parsed); - const double alpha = param.scalar; - - const std::string code = std::string("const OpReqType req = ") + - util::to_string(req[0]) + - ";\n" + - "#define OP op::" + - OP + - "\n" + - binary_scalar_kernel_bwd; - const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; - - const index_t size = outputs[0].Size(); - binary_scalar_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, - {outputs[0].dptr_}, - alpha }; - - VectorizedKernelRTCLauncher(code, "binary_scalar_kernel_bwd", nvec, - size, 1, s, params, - inputs, outputs, - ctx.run_ctx.get_ctx().dev_id); - } -}; - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 5e95df7aa9db..a6fdf1e7572d 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -463,11 +463,38 @@ class BinaryScalarOp : public UnaryOp { .add_argument("data", "NDArray-or-Symbol", "source input") \ .add_arguments(NumpyBinaryScalarParam::__FIELDS__()) +#if MXNET_USE_CUDA + +struct BinaryScalarRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +struct BinaryScalarRTCBackward { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +#endif + } // namespace op } // namespace mxnet -#ifdef __CUDACC__ -#include "elemwise_binary_scalar_op.cuh" -#endif #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index 84f188d3d44f..c662dc2d2923 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -38,7 +38,7 @@ NNVM_REGISTER_OP(_minimum_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_minimum_scalar) -.set_attr("FCompute", BinaryScalarRTCBackward{"lesser_equal"}); +.set_attr("FCompute", BinaryScalarRTCBackward{"less_equal"}); NNVM_REGISTER_OP(_power_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"power"}); diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc new file mode 100644 index 000000000000..683a03d15031 --- /dev/null +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "elemwise_unary_op.h" + +#if MXNET_USE_CUDA +#include "../../common/cuda/rtc/vectorization-inl.h" +#include "../../common/cuda/rtc.h" +#endif // MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +#if MXNET_USE_CUDA + +struct unary_kernel_params { + const void *inputs[1]; + void *outputs[1]; +}; + +const char unary_kernel_fwd[] = R"code( + +struct unary_kernel_params { + const void *inputs[1]; + void *outputs[1]; +}; + +__global__ void unary_kernel(const unary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input = IType::from(loader.separate()[i]); + const auto temp = OP(input); // enables returning different type + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } +} + +)code"; + +void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + "#define OP op::" + + OP + + "\n" + + unary_kernel_fwd; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; + + const index_t size = outputs[0].Size(); + unary_kernel_params params = { {inputs[0].dptr_}, + {outputs[0].dptr_} }; + + VectorizedKernelRTCLauncher(code, "unary_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + +void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (req[0] == kNullOp) { + return; + } + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + InitStorageGeometry<1, 1>(attrs, inputs, outputs); + CHECK_NE(outputs[0].storage_type(), kDefaultStorage) + << "This function works only for sparse types."; + CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()) + << "The storage type of both inputs and outputs needs to be the same."; + AllocateGeometry(&outputs[0], req[0], &inputs[0]); + CopyGeometryBlobs(ctx.get_stream(), &outputs[0], req[0], inputs[0]); + outputs[0].CheckAndAllocData(inputs[0].storage_shape()); + if (inputs[0].storage_shape().Size()) { + std::vector in_blobs, out_blobs; + in_blobs.reserve(inputs.size()); + out_blobs.reserve(outputs.size()); + for (auto &input : inputs) { + in_blobs.emplace_back(input.data()); + } + for (auto &output : outputs) { + out_blobs.emplace_back(output.data()); + } + this->operator()(attrs, ctx, in_blobs, req, out_blobs); + } +} + +#endif // MXNET_USE_CUDA + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh deleted file mode 100644 index 68dbbbf5d64f..000000000000 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2020 by Contributors - * \file elemwise_unary_op.cuh - * \brief GPU helpers for unary elementwise operators - */ - -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ - -#include -#include "../operator_common.h" -#include "../../common/cuda/rtc/vectorization-inl.h" -#include "../../common/cuda/rtc.h" - -#include - -#if MXNET_USE_CUDA - -namespace mxnet { -namespace op { - -struct unary_kernel_params { - const void *inputs[1]; - void *outputs[1]; -}; - -const char unary_kernel_fwd[] = R"code( - -struct unary_kernel_params { - const void *inputs[1]; - void *outputs[1]; -}; - -__global__ void unary_kernel(const unary_kernel_params params, - const index_t lead_dim, - const index_t other_dim, - const index_t N, - const index_t num_aligned_elements) { - using namespace vector; - VectorizedLoader loader( - reinterpret_cast(params.inputs[0]), N); - VectorizedStorer storer( - reinterpret_cast(params.outputs[0]), N); - - using IType = AccType; - using OType = AccType; - - const index_t M = num_aligned_elements; - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader.load(tid, N); - if (req == OpReqType::kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < nvec; ++i) { - const auto input = IType::from(loader.separate()[i]); - const auto temp = OP(input); // enables returning different type - - if (req == OpReqType::kAddTo) { - // temp2 may have a wider type than either temp - // or OType - const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); - storer.separate()[i] = OType::to(temp2); - } else { - storer.separate()[i] = OType::to(temp); - } - } - storer.store(tid, N); - } -} - -)code"; - -struct UnaryRTCCompute { - std::string OP; - - void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mxnet::common::cuda::rtc; - if (req[0] == kNullOp) return; - mshadow::Stream* s = ctx.get_stream(); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - - const std::string code = std::string("const OpReqType req = ") + - util::to_string(req[0]) + - ";\n" + - "#define OP op::" + - OP + - "\n" + - unary_kernel_fwd; - const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; - - const index_t size = outputs[0].Size(); - unary_kernel_params params = { {inputs[0].dptr_}, - {outputs[0].dptr_} }; - - VectorizedKernelRTCLauncher(code, "unary_kernel", nvec, - size, 1, s, params, - inputs, outputs, - ctx.run_ctx.get_ctx().dev_id); - } - - void operator()(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (req[0] == kNullOp) { - return; - } - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - InitStorageGeometry<1, 1>(attrs, inputs, outputs); - CHECK_NE(outputs[0].storage_type(), kDefaultStorage) - << "This function works only for sparse types."; - CHECK_EQ(inputs[0].storage_type(), outputs[0].storage_type()) - << "The storage type of both inputs and outputs needs to be the same."; - AllocateGeometry(&outputs[0], req[0], &inputs[0]); - CopyGeometryBlobs(ctx.get_stream(), &outputs[0], req[0], inputs[0]); - outputs[0].CheckAndAllocData(inputs[0].storage_shape()); - if (inputs[0].storage_shape().Size()) { - std::vector in_blobs, out_blobs; - in_blobs.reserve(inputs.size()); - out_blobs.reserve(outputs.size()); - for (auto &input : inputs) { - in_blobs.emplace_back(input.data()); - } - for (auto &output : outputs) { - out_blobs.emplace_back(output.data()); - } - this->operator()(attrs, ctx, in_blobs, req, out_blobs); - } - } - -}; - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index d9634c252eb1..46595d00388e 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -913,11 +913,28 @@ void NumpyNanToNumOpBackward(const nnvm::NodeAttrs& attrs, MXNET_OPERATOR_REGISTER_UNARY(__name$) \ .set_attr("FCompute<" #__xpu$ ">", UnaryOp::Compute<__xpu$, __kernel$>) -} // namespace op -} // namespace mxnet +#if MXNET_USE_CUDA + +struct UnaryRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +}; -#ifdef __CUDACC__ -#include "elemwise_unary_op.cuh" #endif +} // namespace op +} // namespace mxnet + #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_H_ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index c00aea7e8af4..de8044368157 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -757,9 +757,7 @@ The storage type of ``sign`` output depends upon the input storage type: - sign(csr) = csr )code" ADD_FILELINE) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_sign"}); - -MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sign, unary_bwd); +.set_attr("FGradient", MakeZeroGradNodes); // round MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(round, cpu, mshadow_op::round) diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index b7c487d8b770..e241623c60cd 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -31,15 +31,13 @@ NNVM_REGISTER_OP(relu) .set_attr("FComputeEx", UnaryRTCCompute{"relu"}); NNVM_REGISTER_OP(_backward_relu) 
-.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd>); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_relu"}); NNVM_REGISTER_OP(sigmoid) .set_attr("FCompute", UnaryRTCCompute{"sigmoid"}); NNVM_REGISTER_OP(_backward_sigmoid) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd>); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_sigmoid"}); NNVM_REGISTER_OP(hard_sigmoid) .set_attr("FCompute", HardSigmoidForward); @@ -52,8 +50,7 @@ NNVM_REGISTER_OP(softsign) .set_attr("FCompute", UnaryRTCCompute{"softsign"}); NNVM_REGISTER_OP(_backward_softsign) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd>); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_softsign"}); // erf NNVM_REGISTER_OP(erf) @@ -61,7 +58,7 @@ NNVM_REGISTER_OP(erf) NNVM_REGISTER_OP(_backward_erf) .set_attr("FCompute", - ElemwiseBinaryOp::Compute>); + ElemwiseBinaryRTCCompute{"backward_erf"}); // erfinv NNVM_REGISTER_OP(erfinv) @@ -69,7 +66,7 @@ NNVM_REGISTER_OP(erfinv) NNVM_REGISTER_OP(_backward_erfinv) .set_attr("FCompute", - ElemwiseBinaryOp::Compute>); + ElemwiseBinaryRTCCompute{"backward_erfinv"}); // copy NNVM_REGISTER_OP(_copy) @@ -161,17 +158,13 @@ NNVM_REGISTER_OP(abs) .set_attr("FComputeEx", UnaryRTCCompute{"abs"}); NNVM_REGISTER_OP(_backward_abs) -.set_attr("FCompute", ElemwiseBinaryOp::Compute >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_abs"}); // sign NNVM_REGISTER_OP(sign) .set_attr("FCompute", UnaryRTCCompute{"sign"}) .set_attr("FComputeEx", UnaryRTCCompute{"sign"}); -NNVM_REGISTER_OP(_backward_sign) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); - // round NNVM_REGISTER_OP(round) .set_attr("FCompute", UnaryRTCCompute{"round"}) @@ -207,16 +200,14 @@ NNVM_REGISTER_OP(gamma) .set_attr("FCompute", UnaryRTCCompute{"gamma"}); NNVM_REGISTER_OP(_backward_gamma) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_gamma"}); // gammaln NNVM_REGISTER_OP(gammaln) .set_attr("FCompute", UnaryRTCCompute{"gammaln"}); NNVM_REGISTER_OP(_backward_gammaln) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_gammaln"}); // digamma NNVM_REGISTER_OP(digamma) diff --git a/src/operator/tensor/elemwise_unary_op_logexp.cu b/src/operator/tensor/elemwise_unary_op_logexp.cu index 42189497af88..e0f0d69cac11 100644 --- a/src/operator/tensor/elemwise_unary_op_logexp.cu +++ b/src/operator/tensor/elemwise_unary_op_logexp.cu @@ -43,16 +43,13 @@ NNVM_REGISTER_OP(log2) .set_attr("FCompute", UnaryRTCCompute{"log2"}); NNVM_REGISTER_OP(_backward_log) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log"}); NNVM_REGISTER_OP(_backward_log10) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log10"}); NNVM_REGISTER_OP(_backward_log2) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log2"}); // log1p NNVM_REGISTER_OP(log1p) @@ -60,8 +57,7 @@ NNVM_REGISTER_OP(log1p) .set_attr("FComputeEx", UnaryRTCCompute{"log1p"}); NNVM_REGISTER_OP(_backward_log1p) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log1p"}); // expm1 NNVM_REGISTER_OP(expm1) @@ -69,8 +65,7 @@ NNVM_REGISTER_OP(expm1) 
.set_attr("FComputeEx", UnaryRTCCompute{"expm1"}); NNVM_REGISTER_OP(_backward_expm1) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_expm1"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op_pow.cu b/src/operator/tensor/elemwise_unary_op_pow.cu index ed555b153261..c05627724738 100644 --- a/src/operator/tensor/elemwise_unary_op_pow.cu +++ b/src/operator/tensor/elemwise_unary_op_pow.cu @@ -32,8 +32,7 @@ NNVM_REGISTER_OP(reciprocal) .set_attr("FCompute", UnaryRTCCompute{"reciprocal"}); NNVM_REGISTER_OP(_backward_reciprocal) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_reciprocal"}); // square NNVM_REGISTER_OP(square) @@ -41,8 +40,7 @@ NNVM_REGISTER_OP(square) .set_attr("FComputeEx", UnaryRTCCompute{"square"}); NNVM_REGISTER_OP(_backward_square) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_square"}); // sqrt NNVM_REGISTER_OP(sqrt) @@ -50,8 +48,7 @@ NNVM_REGISTER_OP(sqrt) .set_attr("FComputeEx", UnaryRTCCompute{"sqrt"}); NNVM_REGISTER_OP(_backward_sqrt) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_sqrt"}); // rsqrt NNVM_REGISTER_OP(rsqrt) @@ -59,7 +56,7 @@ NNVM_REGISTER_OP(rsqrt) NNVM_REGISTER_OP(_backward_rsqrt) .set_attr("FCompute", - ElemwiseBinaryOp::Compute >); + ElemwiseBinaryRTCCompute{"backward_rsqrt"}); // cbrt NNVM_REGISTER_OP(cbrt) @@ -68,8 +65,7 @@ NNVM_REGISTER_OP(cbrt) NNVM_REGISTER_OP(_backward_cbrt) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_cbrt"}); // rcbrt NNVM_REGISTER_OP(rcbrt) @@ -77,7 +73,7 @@ NNVM_REGISTER_OP(rcbrt) NNVM_REGISTER_OP(_backward_rcbrt) .set_attr("FCompute", - ElemwiseBinaryOp::Compute >); + ElemwiseBinaryRTCCompute{"backward_rcbrt"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op_trig.cu b/src/operator/tensor/elemwise_unary_op_trig.cu index 7407252ac53a..8adf6b6ea260 100644 --- a/src/operator/tensor/elemwise_unary_op_trig.cu +++ b/src/operator/tensor/elemwise_unary_op_trig.cu @@ -33,16 +33,14 @@ NNVM_REGISTER_OP(sin) .set_attr("FComputeEx", UnaryRTCCompute{"sin"}); NNVM_REGISTER_OP(_backward_sin) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_sin"}); // cos NNVM_REGISTER_OP(cos) .set_attr("FCompute", UnaryRTCCompute{"cos"}); NNVM_REGISTER_OP(_backward_cos) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_cos"}); // tan NNVM_REGISTER_OP(tan) @@ -50,8 +48,7 @@ NNVM_REGISTER_OP(tan) .set_attr("FComputeEx", UnaryRTCCompute{"tan"}); NNVM_REGISTER_OP(_backward_tan) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_tan"}); // arcsin NNVM_REGISTER_OP(arcsin) @@ -59,16 +56,14 @@ NNVM_REGISTER_OP(arcsin) .set_attr("FComputeEx", UnaryRTCCompute{"arcsin"}); NNVM_REGISTER_OP(_backward_arcsin) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arcsin"}); // arccos NNVM_REGISTER_OP(arccos) .set_attr("FCompute", UnaryRTCCompute{"arccos"}); 
NNVM_REGISTER_OP(_backward_arccos) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arccos"}); // arctan NNVM_REGISTER_OP(arctan) @@ -76,8 +71,7 @@ NNVM_REGISTER_OP(arctan) .set_attr("FComputeEx", UnaryRTCCompute{"arctan"}); NNVM_REGISTER_OP(_backward_arctan) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arctan"}); // degrees NNVM_REGISTER_OP(degrees) @@ -85,8 +79,7 @@ NNVM_REGISTER_OP(degrees) .set_attr("FComputeEx", UnaryRTCCompute{"degrees"}); NNVM_REGISTER_OP(_backward_degrees) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_degrees"}); // radians NNVM_REGISTER_OP(radians) @@ -94,16 +87,14 @@ NNVM_REGISTER_OP(radians) .set_attr("FComputeEx", UnaryRTCCompute{"radians"}); NNVM_REGISTER_OP(_backward_radians) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_radians"}); // cosh NNVM_REGISTER_OP(cosh) .set_attr("FCompute", UnaryRTCCompute{"cosh"}); NNVM_REGISTER_OP(_backward_cosh) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_cosh"}); // sinh NNVM_REGISTER_OP(sinh) @@ -111,8 +102,7 @@ NNVM_REGISTER_OP(sinh) .set_attr("FComputeEx", UnaryRTCCompute{"sinh"}); NNVM_REGISTER_OP(_backward_sinh) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_sinh"}); // tanh NNVM_REGISTER_OP(tanh) @@ -120,8 +110,7 @@ NNVM_REGISTER_OP(tanh) .set_attr("FComputeEx", UnaryRTCCompute{"tanh"}); NNVM_REGISTER_OP(_backward_tanh) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_tanh"}); // arcsinh NNVM_REGISTER_OP(arcsinh) @@ -129,16 +118,14 @@ NNVM_REGISTER_OP(arcsinh) .set_attr("FComputeEx", UnaryRTCCompute{"arcsinh"}); NNVM_REGISTER_OP(_backward_arcsinh) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arcsinh"}); // arccosh NNVM_REGISTER_OP(arccosh) .set_attr("FCompute", UnaryRTCCompute{"arccosh"}); NNVM_REGISTER_OP(_backward_arccosh) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arccosh"}); // arctanh NNVM_REGISTER_OP(arctanh) @@ -146,8 +133,7 @@ NNVM_REGISTER_OP(arctanh) .set_attr("FComputeEx", UnaryRTCCompute{"arctanh"}); NNVM_REGISTER_OP(_backward_arctanh) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arctanh"}); } // namespace op } // namespace mxnet From 18f1c91be3356c4efbe4053b2c8a3f7c39af7936 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 8 Jun 2020 13:04:22 -0700 Subject: [PATCH 10/64] Backward of binary ops --- src/common/cuda/rtc.cc | 25 ++ src/common/cuda/rtc.h | 10 + src/common/cuda/rtc/backward_functions-inl.h | 28 ++ src/common/cuda/rtc/forward_functions-inl.h | 10 + src/common/cuda/rtc/vectorization-inl.h | 35 +-- src/operator/nn/group_norm-inl.h | 8 +- src/operator/nn/layer_norm-inl.h | 8 +- src/operator/tensor/elemwise_binary_op.cc | 258 +++++++++++++++++- src/operator/tensor/elemwise_binary_op.h | 22 ++ .../tensor/elemwise_binary_op_basic.cu | 14 +- .../tensor/elemwise_binary_op_extended.cu | 
13 +- .../tensor/elemwise_binary_scalar_op.cc | 2 +- src/operator/tensor/elemwise_unary_op.cc | 3 +- src/operator/tensor/elemwise_unary_op.h | 2 +- 14 files changed, 370 insertions(+), 68 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 24f21d59c794..55d06e70846c 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -61,6 +61,31 @@ std::string to_string(OpReqType req) { LOG(FATAL) << "Unrecognized req."; } +TypeInfo mshadow_type_info(int type_flag) { + using namespace mshadow; + switch (type_flag) { + case kFloat32: + return TypeInfo("float32", sizeof(float)); + case kFloat64: + return TypeInfo("float64", sizeof(double)); + case kFloat16: + return TypeInfo("float16", 2); + case kUint8: + return TypeInfo("uint8", sizeof(uint8_t)); + case kInt32: + return TypeInfo("int32", sizeof(int32_t)); + case kInt8: + return TypeInfo("int8", sizeof(int8_t)); + case kInt64: + return TypeInfo("int64", sizeof(int64_t)); + case kBool: + return TypeInfo("bool", sizeof(bool)); + default: + LOG(FATAL) << "Unknown type flag " << type_flag; + return TypeInfo("INVALID", 1); + } +} + } // namespace util namespace { diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h index 799fbacf698c..8ca06b6cd1e2 100644 --- a/src/common/cuda/rtc.h +++ b/src/common/cuda/rtc.h @@ -48,6 +48,16 @@ namespace util { std::string to_string(OpReqType req); +struct TypeInfo { + std::string name; + int size; + + TypeInfo(const std::string name, const int size) : + name(std::move(name)), size(size) {} +}; + +TypeInfo mshadow_type_info(int type_flag); + } extern std::mutex lock; diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index 44d39eeb17ba..cb388a0caf03 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -193,6 +193,18 @@ __device__ inline DType rdiv_grad(const DType val, return -val2 / (val * val); } +template +__device__ inline DType div_grad(const DType val, + const DType2 val2) { + return op::reciprocal(val2); +} + +template +__device__ inline DType div_rgrad(const DType val, + const DType2 val2) { + return -val / (val2 * val2); +} + template __device__ inline DType mod_grad(const DType val, const DType2 val2) { @@ -203,6 +215,16 @@ __device__ inline DType mod_grad(const DType val, } } +template +__device__ inline DType mod_rgrad(const DType val, + const DType2 val2) { + if (type_util::is_integral::value) { + return 0; + } else { + return -op::floor(val / val2); + } +} + template __device__ inline DType rmod_grad(const DType val, const DType2 val2) { @@ -219,6 +241,12 @@ __device__ inline DType power_grad(const DType val, return op::power(val, val2 - 1.f) * val2; } +template +__device__ inline DType power_rgrad(const DType val, + const DType2 val2) { + return op::power(val, val2) * op::log(val); +} + template __device__ inline DType rpower_grad(const DType val, const DType2 val2) { diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index e29ec3054f6e..509ff9ef5c0e 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -747,6 +747,16 @@ __device__ inline bool np_logical_not(const DType val) { #undef DEFINE_UNARY_MATH_FUNC +template +__device__ inline DType left(DType left_val, DType2 right_val) { + return left_val; +} + +template +__device__ inline DType2 right(DType left_val, DType2 right_val) { + return right_val; +} + } // namespace op )code"; diff --git 
a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index 34c33b198000..682be9207a97 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -240,39 +240,6 @@ index_t get_num_aligned_elements(const void *ptr, const index_t lead_dim, return (lead_dim + alignment + nvec - 1) / nvec; } -struct TypeInfo { - std::string name; - int size; - - TypeInfo(const std::string name, const int size) : - name(std::move(name)), size(size) {} -}; - -TypeInfo mshadow_type_info(int type_flag) { - using namespace mshadow; - switch (type_flag) { - case kFloat32: - return TypeInfo("float32", sizeof(float)); - case kFloat64: - return TypeInfo("float64", sizeof(double)); - case kFloat16: - return TypeInfo("float16", 2); - case kUint8: - return TypeInfo("uint8", sizeof(uint8_t)); - case kInt32: - return TypeInfo("int32", sizeof(int32_t)); - case kInt8: - return TypeInfo("int8", sizeof(int8_t)); - case kInt64: - return TypeInfo("int64", sizeof(int64_t)); - case kBool: - return TypeInfo("bool", sizeof(bool)); - default: - LOG(FATAL) << "Unknown type flag " << type_flag; - return TypeInfo("INVALID", 1); - } -} - enum class Alignment { SAME_ALIGNED, // All tensors aligned SAME_UNALIGNED, // All tensors have the same misalignment @@ -294,6 +261,7 @@ Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim, const int nvec, const std::vector &inputs, const std::vector &outputs) { + using namespace util; int align = -1; size_t i = 0; @@ -356,6 +324,7 @@ void VectorizedKernelRTCLauncher(const std::string &code, const std::vector &inputs, const std::vector &outputs, const int dev_id) { + using namespace util; const index_t N = lead_dim * other_dim; if (N != 0) { index_t num_aligned_elements = get_num_aligned_elements(params.inputs[0], lead_dim, nvec, diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index deca04ee84a9..f7d25c91a4d1 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -294,8 +294,8 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, {kWriteTo}, {ograd_mult}); } else { - ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); } if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { @@ -335,8 +335,8 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); } else { - ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index 085324b3b871..581319ea18b5 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -280,8 +280,8 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, {kWriteTo}, {ograd_mult}); } else { - ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, 
ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); } if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { @@ -331,8 +331,8 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); } else { - ElemwiseBinaryRTCCompute{"mul"}(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index db9f6e022421..50df0e551131 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -99,15 +99,15 @@ bool ElemwiseBinaryOp::BackwardUseInStorageType(const nnvm::NodeAttrs& attrs, #if MXNET_USE_CUDA struct binary_kernel_params { - const void *inputs[2]; - void *outputs[1]; + const void *inputs[3]; + void *outputs[2]; }; const char binary_kernel_fwd[] = R"code( struct binary_kernel_params { - const void *inputs[2]; - void *outputs[1]; + const void *inputs[3]; + void *outputs[2]; }; __global__ void binary_kernel(const binary_kernel_params params, @@ -171,7 +171,7 @@ void ElemwiseBinaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const std::string code = std::string("const OpReqType req = ") + util::to_string(req[0]) + - ";\n" + + ";\n" "#define OP op::" + OP + "\n" + @@ -179,8 +179,8 @@ void ElemwiseBinaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; const index_t size = outputs[0].Size(); - binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_}, - {outputs[0].dptr_} }; + binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_, nullptr}, + {outputs[0].dptr_, nullptr} }; VectorizedKernelRTCLauncher(code, "binary_kernel", nvec, size, 1, s, params, @@ -188,6 +188,250 @@ void ElemwiseBinaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, ctx.run_ctx.get_ctx().dev_id); } +const char binary_kernel_bwd_use_none[] = R"code( + +struct binary_kernel_params { + const void *inputs[3]; + void *outputs[2]; +}; + +__global__ void binary_kernel_bwd(const binary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer lstorer( + reinterpret_cast(params.outputs[0]), N); + VectorizedStorer rstorer( + reinterpret_cast(params.outputs[1]), N); + + using IType = AccType; + using OType0 = AccType; + using OType1 = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (lreq == OpReqType::kAddTo) { + lstorer.load(tid, N); + } + if (rreq == OpReqType::kAddTo) { + rstorer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input = IType::from(loader.separate()[i]); + if (write_left_output) { + const auto temp = LOP(input); + if (lreq == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType0::from(lstorer.separate()[i])); + lstorer.separate()[i] = OType0::to(temp2); + } else { + 
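+            // kWriteTo / kWriteInplace: overwrite the output slot directly;
+            // no read-modify-write of the previous contents is required.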
lstorer.separate()[i] = OType0::to(temp); + } + } + if (write_right_output) { + const auto temp = ROP(input); + if (rreq == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType1::from(rstorer.separate()[i])); + rstorer.separate()[i] = OType1::to(temp2); + } else { + rstorer.separate()[i] = OType1::to(temp); + } + } + } + if (write_left_output) { + lstorer.store(tid, N); + } + if (write_right_output) { + rstorer.store(tid, N); + } + } +} +)code"; + +void ElemwiseBinaryRTCBwdUseNone::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp && req[1] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U); + + bool write_left_output = req[0] != kNullOp && + (req[0] != kWriteInplace || + (req[0] == kWriteInplace && LOP != "identity")); + + bool write_right_output = req[1] != kNullOp && + (req[1] != kWriteInplace || + (req[1] == kWriteInplace && LOP != "identity")); + + const std::string code = std::string("const OpReqType lreq = ") + + util::to_string(req[0]) + + ";\n" + "const OpReqType rreq = " + + util::to_string(req[1]) + + ";\n" + "#define ROP op::" + + ROP + + "\n" + "#define LOP op::" + + LOP + + "\n" + "const bool write_left_output = " + + std::to_string(write_left_output) + + ";\n" + "const bool write_right_output = " + + std::to_string(write_right_output) + + ";\n" + + binary_kernel_bwd_use_none; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + + const index_t size = outputs[0].Size(); + binary_kernel_params params = { {inputs[0].dptr_, nullptr, nullptr}, + {outputs[0].dptr_, outputs[1].dptr_} }; + + VectorizedKernelRTCLauncher(code, "binary_kernel_bwd", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + +const char binary_kernel_bwd_use_in[] = R"code( + +struct binary_kernel_params { + const void *inputs[3]; + void *outputs[2]; +}; + +__global__ void binary_kernel_bwd(const binary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader ograd_loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedLoader linput_loader( + reinterpret_cast(params.inputs[1]), N); + VectorizedLoader rinput_loader( + reinterpret_cast(params.inputs[2]), N); + + VectorizedStorer lstorer( + reinterpret_cast(params.outputs[0]), N); + VectorizedStorer rstorer( + reinterpret_cast(params.outputs[1]), N); + + using IType0 = AccType; + using IType1 = AccType; + using IType2 = AccType; + using OType0 = AccType; + using OType1 = AccType; + + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + ograd_loader.load(tid, N); + linput_loader.load(tid, N); + rinput_loader.load(tid, N); + if (lreq == OpReqType::kAddTo) { + lstorer.load(tid, N); + } + if (rreq == OpReqType::kAddTo) { + rstorer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto ograd = IType0::from(ograd_loader.separate()[i]); + const auto linput = IType1::from(linput_loader.separate()[i]); + const auto rinput = IType2::from(rinput_loader.separate()[i]); + + if (lreq != OpReqType::kNullOp) { + const auto temp = op::mul(ograd, LOP(linput, rinput)); + 
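+        // temp is the incoming gradient times the partial derivative w.r.t. the
+        // left input; LOP/ROP are spliced in through the "#define LOP op::..." /
+        // "#define ROP op::..." lines built in ElemwiseBinaryRTCBwdUseIn::operator()
+        // (e.g. div_grad / div_rgrad for _backward_div).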
if (lreq == OpReqType::kAddTo) { + const auto temp2 = op::add(temp, OType0::from(lstorer.separate()[i])); + lstorer.separate()[i] = OType0::to(temp2); + } else { + lstorer.separate()[i] = OType0::to(temp); + } + } + + if (rreq != OpReqType::kNullOp) { + const auto temp = op::mul(ograd, ROP(linput, rinput)); + if (rreq == OpReqType::kAddTo) { + const auto temp2 = op::add(temp, OType1::from(rstorer.separate()[i])); + rstorer.separate()[i] = OType1::to(temp2); + } else { + rstorer.separate()[i] = OType1::to(temp); + } + } + } + if (lreq != OpReqType::kNullOp) { + lstorer.store(tid, N); + } + if (rreq != OpReqType::kNullOp) { + rstorer.store(tid, N); + } + } +} +)code"; + +void ElemwiseBinaryRTCBwdUseIn::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp && req[1] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 2U); + + const std::string code = std::string("const OpReqType lreq = ") + + util::to_string(req[0]) + + ";\n" + "const OpReqType rreq = " + + util::to_string(req[1]) + + ";\n" + "#define ROP op::" + + ROP + + "\n" + "#define LOP op::" + + LOP + + "\n" + + binary_kernel_bwd_use_in; + // Using 64 bit loads to reduce register pressure + int output_type_size = util::mshadow_type_info(outputs[0].type_flag_).size; + const int nvec = output_type_size <= sizeof(uint64_t) + ? (sizeof(uint64_t) / output_type_size) + : 1; + + const index_t size = outputs[0].Size(); + binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_, inputs[2].dptr_}, + {outputs[0].dptr_, outputs[1].dptr_} }; + + VectorizedKernelRTCLauncher(code, "binary_kernel_bwd", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + #endif // MXNET_USE_CUDA diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 8c2a2cf92c08..156a88955550 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -877,6 +877,28 @@ struct ElemwiseBinaryRTCCompute { const std::vector& outputs); }; +struct ElemwiseBinaryRTCBwdUseNone { + std::string LOP; + std::string ROP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +struct ElemwiseBinaryRTCBwdUseIn { + std::string LOP; + std::string ROP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + #endif } // namespace op diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index 927a833a5ac8..c0cf231596a1 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -227,8 +227,7 @@ NNVM_REGISTER_OP(_grad_add) NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNone); + ElemwiseBinaryRTCBwdUseNone{"identity", "identity"}); NNVM_REGISTER_OP(elemwise_sub) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"sub"}) @@ -236,8 +235,7 @@ NNVM_REGISTER_OP(elemwise_sub) NNVM_REGISTER_OP(_backward_sub) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNone); + ElemwiseBinaryRTCBwdUseNone{"identity", "negation"}); NNVM_REGISTER_OP(elemwise_mul) .set_attr("FCompute", 
ElemwiseBinaryRTCCompute{"mul"}) @@ -246,8 +244,7 @@ NNVM_REGISTER_OP(elemwise_mul) NNVM_REGISTER_OP(_backward_mul) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseIn); + ElemwiseBinaryRTCBwdUseIn{"right", "left"}); NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", @@ -255,15 +252,14 @@ NNVM_REGISTER_OP(elemwise_div) NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseIn); + ElemwiseBinaryRTCBwdUseIn{"div_grad", "div_rgrad"}); NNVM_REGISTER_OP(_mod) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseIn); + ElemwiseBinaryRTCBwdUseIn{"mod_grad","mod_rgrad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu index 2bf608d53660..8f7087b9ac08 100644 --- a/src/operator/tensor/elemwise_binary_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_op_extended.cu @@ -30,29 +30,26 @@ NNVM_REGISTER_OP(_power) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_power) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); +.set_attr("FCompute", ElemwiseBinaryRTCBwdUseIn{"power_grad","power_rgrad"}); NNVM_REGISTER_OP(_maximum) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"max"}); NNVM_REGISTER_OP(_backward_maximum) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); +.set_attr("FCompute", ElemwiseBinaryRTCBwdUseIn{"greater_equal", "less"}); NNVM_REGISTER_OP(_minimum) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_minimum) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); +.set_attr("FCompute", ElemwiseBinaryRTCBwdUseIn{"less_equal", "greater"}); NNVM_REGISTER_OP(_hypot) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_hypot) -.set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn); +.set_attr("FCompute", ElemwiseBinaryRTCBwdUseIn{"hypot_grad_left", + "hypot_grad_right"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index 82a2775c13e4..4fd4df391d69 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -225,7 +225,7 @@ void BinaryScalarRTCBackward::operator()(const nnvm::NodeAttrs& attrs, const std::string code = std::string("const OpReqType req = ") + util::to_string(req[0]) + - ";\n" + + ";\n" "#define OP op::" + OP + "\n" + diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index 683a03d15031..c3e5369f9e67 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -17,6 +17,7 @@ * under the License. 
*/ +#include #include "elemwise_unary_op.h" #if MXNET_USE_CUDA @@ -97,7 +98,7 @@ void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const std::string code = std::string("const OpReqType req = ") + util::to_string(req[0]) + - ";\n" + + ";\n" "#define OP op::" + OP + "\n" + diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 46595d00388e..9e9448d056a3 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "./cast_storage-inl.h" #include "../mshadow_op.h" #include "../mxnet_op.h" @@ -929,7 +930,6 @@ struct UnaryRTCCompute { const std::vector& inputs, const std::vector& req, const std::vector& outputs); - }; #endif From 2e27675dede8282f19ce285dd548ce1817c4cdcb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 9 Jun 2020 10:20:41 -0700 Subject: [PATCH 11/64] Suuport broadcast Add RTC to NumPy ops --- src/common/cuda/rtc.cc | 4 +- src/common/cuda/rtc.h | 2 +- src/common/cuda/rtc/backward_functions-inl.h | 110 ++++-- src/common/cuda/rtc/forward_functions-inl.h | 48 ++- .../cuda/rtc/{type-inl.h => util-inl.h} | 92 ++++- src/common/cuda/rtc/vectorization-inl.h | 26 +- src/operator/nn/group_norm-inl.h | 132 +++++-- src/operator/nn/layer_norm-inl.h | 122 ++++-- .../numpy/np_elemwise_broadcast_logic_op.cu | 2 +- .../numpy/np_elemwise_broadcast_op.cu | 25 +- .../np_elemwise_broadcast_op_extended.cu | 12 +- src/operator/tensor/broadcast_reduce-inl.cuh | 239 ------------ src/operator/tensor/broadcast_reduce-inl.h | 4 +- .../tensor/elemwise_binary_broadcast_op.cc | 348 ++++++++++++++++++ .../tensor/elemwise_binary_broadcast_op.h | 14 + .../elemwise_binary_broadcast_op_basic.cu | 10 +- .../elemwise_binary_broadcast_op_extended.cu | 8 +- .../elemwise_binary_broadcast_op_logic.cu | 18 +- .../tensor/elemwise_binary_op_basic.cu | 2 +- .../tensor/elemwise_binary_op_extended.cu | 2 +- 20 files changed, 797 insertions(+), 423 deletions(-) rename src/common/cuda/rtc/{type-inl.h => util-inl.h} (54%) create mode 100644 src/operator/tensor/elemwise_binary_broadcast_op.cc diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 55d06e70846c..9ec33074d93c 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -31,7 +31,7 @@ #include "rtc.h" #include "rtc/half-inl.h" -#include "rtc/type-inl.h" +#include "rtc/util-inl.h" #include "rtc/forward_functions-inl.h" #include "rtc/backward_functions-inl.h" #include "rtc/vectorization-inl.h" @@ -139,7 +139,7 @@ CUfunction get_function(const std::string &code, static std::string common_header = std::string(fp16_support_string) + "\n" + type_support_string + "\n" + - op_req_type_string + "\n" + + util_string + "\n" + float_limits() + special_functions_definitions + '\n' + function_definitions + "\n" + diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h index 8ca06b6cd1e2..057ba5c79896 100644 --- a/src/common/cuda/rtc.h +++ b/src/common/cuda/rtc.h @@ -58,7 +58,7 @@ struct TypeInfo { TypeInfo mshadow_type_info(int type_flag); -} +} // namespace util extern std::mutex lock; diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index cb388a0caf03..4d88e2639f38 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -32,158 +32,188 @@ const char backward_function_definitions[] = R"code( namespace op { template -__device__ inline DTypeGrad backward_relu(const DTypeGrad grad, const 
DType val) { +__device__ inline typename type_util::mixed_type::type +backward_relu(const DTypeGrad grad, const DType val) { if (isnan(val)) return val; return val > 0 ? grad : 0; } template -__device__ inline DTypeGrad backward_sigmoid(const DTypeGrad grad, const DType out) { +__device__ inline typename type_util::mixed_type::type +backward_sigmoid(const DTypeGrad grad, const DType out) { return grad * out * (1 - out); } template -__device__ inline DTypeGrad backward_softrelu(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_softrelu(const DTypeGrad grad, const DType val) { return grad * sigmoid(val); } template -__device__ inline DTypeGrad backward_softsign(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_softsign(const DTypeGrad grad, const DType val) { const DType ap1 = 1 + op::abs(val); return grad / (ap1 * ap1); } template -__device__ inline DTypeGrad backward_abs(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_abs(const DTypeGrad grad, const DType val) { return grad * op::sign(val); } template -__device__ inline DTypeGrad backward_exp(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_exp(const DTypeGrad grad, const DType val) { return grad * op::exp(val); } template -__device__ inline DTypeGrad backward_expm1(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_expm1(const DTypeGrad grad, const DType val) { return backward_exp(grad, val); } template -__device__ inline DTypeGrad backward_log(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_log(const DTypeGrad grad, const DType val) { return grad / val; } template -__device__ inline DTypeGrad backward_log10(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_log10(const DTypeGrad grad, const DType val) { return grad / (val * op::log(static_cast(10))); } template -__device__ inline DTypeGrad backward_log2(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_log2(const DTypeGrad grad, const DType val) { return grad / (val * op::log(static_cast(2))); } template -__device__ inline DTypeGrad backward_log1p(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_log1p(const DTypeGrad grad, const DType val) { return grad / (1 + val); } template -__device__ inline DTypeGrad backward_sin(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_sin(const DTypeGrad grad, const DType val) { return grad * op::cos(val); } template -__device__ inline DTypeGrad backward_cos(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_cos(const DTypeGrad grad, const DType val) { return -grad * op::sin(val); } // Uses output from tan template -__device__ inline DTypeGrad backward_tan(const DTypeGrad grad, const DType out) { +__device__ inline typename type_util::mixed_type::type +backward_tan(const DTypeGrad grad, const DType out) { return grad * (out * out + 1); } template -__device__ inline DTypeGrad backward_arcsin(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_arcsin(const DTypeGrad grad, const DType 
val) { return grad / op::sqrt(1 - val*val); } template -__device__ inline DTypeGrad backward_arccos(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_arccos(const DTypeGrad grad, const DType val) { return -grad / op::sqrt(1 - val*val); } template -__device__ inline DTypeGrad backward_arctan(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_arctan(const DTypeGrad grad, const DType val) { return grad / (1 + val*val); } template -__device__ inline DTypeGrad backward_degrees(const DTypeGrad grad, const DType /* val */) { +__device__ inline typename type_util::mixed_type::type +backward_degrees(const DTypeGrad grad, const DType /* val */) { return op::degrees(grad); } template -__device__ inline DTypeGrad backward_radians(const DTypeGrad grad, const DType /* val */) { +__device__ inline typename type_util::mixed_type::type +backward_radians(const DTypeGrad grad, const DType /* val */) { return op::radians(grad); } template -__device__ inline DTypeGrad backward_sinh(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_sinh(const DTypeGrad grad, const DType val) { return grad * op::cosh(val); } template -__device__ inline DTypeGrad backward_cosh(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_cosh(const DTypeGrad grad, const DType val) { return grad * op::sinh(val); } // Uses tanh output template -__device__ inline DTypeGrad backward_tanh(const DTypeGrad grad, const DType out) { +__device__ inline typename type_util::mixed_type::type +backward_tanh(const DTypeGrad grad, const DType out) { return grad * (1 - out * out); } template -__device__ inline DTypeGrad backward_arcsinh(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_arcsinh(const DTypeGrad grad, const DType val) { return grad / op::sqrt(val * val + 1); } template -__device__ inline DTypeGrad backward_arccosh(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_arccosh(const DTypeGrad grad, const DType val) { return grad / op::sqrt(val * val - 1); } template -__device__ inline DTypeGrad backward_arctanh(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_arctanh(const DTypeGrad grad, const DType val) { return grad / (1 - val * val); } template -__device__ inline DTypeGrad backward_sqrt(const DTypeGrad grad, const DType out) { +__device__ inline typename type_util::mixed_type::type +backward_sqrt(const DTypeGrad grad, const DType out) { return 0.5 * grad / out; } template -__device__ inline DTypeGrad backward_rsqrt(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_rsqrt(const DTypeGrad grad, const DType val) { const DType inv = 1 / val; return -0.5 * grad * op::sqrt(inv) * inv; } template -__device__ inline DTypeGrad backward_cbrt(const DTypeGrad grad, const DType out) { +__device__ inline typename type_util::mixed_type::type +backward_cbrt(const DTypeGrad grad, const DType out) { return grad / (3.0f * out * out); } template -__device__ inline DTypeGrad backward_rcbrt(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_rcbrt(const DTypeGrad grad, const DType val) { const DType inv = 1 / val; return -1.f/3.f * grad * op::cbrt(inv) * inv; } 
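+// The backward helpers above now return type_util::mixed_type<DTypeGrad, DType>
+// (see util-inl.h later in this series): float64 dominates, then float32, then
+// float16, and for two integral types the wider one wins. For example,
+// backward_sigmoid with a float32 gradient and a float16 output computes and
+// returns float32 instead of truncating to the narrower input type.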
template -__device__ inline DTypeGrad backward_square(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_square(const DTypeGrad grad, const DType val) { return 2 * val * grad; } @@ -296,8 +326,9 @@ __device__ inline DType rldexp_grad(const DType val, } template -__device__ inline DTypeGrad backward_clip(const DTypeGrad grad, const DType val, - const float a_min, const float a_max) { +__device__ inline typename type_util::mixed_type::type +backward_clip(const DTypeGrad grad, const DType val, + const float a_min, const float a_max) { if (val > a_max || val < a_min) { return 0; } else { @@ -306,22 +337,26 @@ __device__ inline DTypeGrad backward_clip(const DTypeGrad grad, const DType val, } template -__device__ inline DTypeGrad backward_reciprocal(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_reciprocal(const DTypeGrad grad, const DType val) { return -grad / (val * val); } template -__device__ inline DTypeGrad backward_erf(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_erf(const DTypeGrad grad, const DType val) { return 2.0f / op::sqrt(pi) * op::exp(-(val*val)) * grad; } template -__device__ inline DTypeGrad backward_erfinv(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_erfinv(const DTypeGrad grad, const DType val) { return 0.5f * op::sqrt(pi) * op::exp(val * val) * grad; } template -__device__ inline DTypeGrad backward_gamma(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_gamma(const DTypeGrad grad, const DType val) { if (type_util::is_same::value) { return grad * op::gamma(val) * op::special_functions::cephes::psi(val); } else { @@ -330,7 +365,8 @@ __device__ inline DTypeGrad backward_gamma(const DTypeGrad grad, const DType val } template -__device__ inline DTypeGrad backward_gammaln(const DTypeGrad grad, const DType val) { +__device__ inline typename type_util::mixed_type::type +backward_gammaln(const DTypeGrad grad, const DType val) { if (type_util::is_same::value) { return grad * op::special_functions::cephes::psi(val); } else { diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 509ff9ef5c0e..2ac955ae4622 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -275,38 +275,45 @@ __device__ inline DType negation(const DType val) { } template -__device__ inline DType add(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +add(const DType a, const DType2 b) { return a + b; } template -__device__ inline DType sub(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +sub(const DType a, const DType2 b) { return a - b; } template -__device__ inline DType rsub(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +rsub(const DType a, const DType2 b) { return b - a; } template -__device__ inline DType mul(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +mul(const DType a, const DType2 b) { return a * b; } template -__device__ inline DType div(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +div(const DType a, const DType2 b) { return a / b; } template -__device__ inline DType rdiv(const DType a, const DType2 
b) { +__device__ inline typename type_util::mixed_type::type +rdiv(const DType a, const DType2 b) { return b / a; } #define DEFINE_BINARY_MATH_FUNC(name, double_version, float_version) \ template \ -__device__ inline DType name (const DType a, const DType2 b) { \ +__device__ inline typename type_util::mixed_type::type \ +name (const DType a, const DType2 b) { \ if (type_util::has_double_or_integral::value) { \ return double_version ((double)a, (double)b); \ } else { \ @@ -317,18 +324,21 @@ __device__ inline DType name (const DType a, const DType2 b) { \ DEFINE_BINARY_MATH_FUNC(power, ::pow, ::powf) template -__device__ inline DType rpow(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +rpow(const DType a, const DType2 b) { return power(b, a); } template -__device__ inline DType max(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +max(const DType a, const DType2 b) { if (isnan(a)) return a; return a > b ? a : b; } template -__device__ inline DType min(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +min(const DType a, const DType2 b) { if (isnan(a)) return a; return a < b ? a : b; } @@ -336,7 +346,8 @@ __device__ inline DType min(const DType a, const DType2 b) { DEFINE_BINARY_MATH_FUNC(hypot, ::hypot, ::hypotf) template -__device__ inline DType mod(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +mod(const DType a, const DType2 b) { if (b == 0) { return 0; } @@ -360,7 +371,8 @@ __device__ inline DType mod(const DType a, const DType2 b) { } template -__device__ inline DType rmod(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +rmod(const DType a, const DType2 b) { return op::mod(b, a); } @@ -451,7 +463,8 @@ __device__ inline DType rcopysign(const DType a, const DType2 b) { } template -__device__ inline DType lcm(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +lcm(const DType a, const DType2 b) { if (type_util::is_integral::value && type_util::is_integral::value) { DType A = a; @@ -491,7 +504,8 @@ __device__ inline DType lcm(const DType a, const DType2 b) { } template -__device__ inline DType bitwise_xor(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type bitwise_xor(const DType a, + const DType2 b) { return static_cast(a) ^ static_cast(b); } @@ -499,14 +513,16 @@ __device__ inline DType bitwise_xor(const DType a, const DType2 b) { DEFINE_BINARY_MATH_FUNC(arctan2, ::atan2, ::atan2f) template -__device__ inline DType rarctan2(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +rarctan2(const DType a, const DType2 b) { return arctan2(b, a); } DEFINE_BINARY_MATH_FUNC(ldexp, a * ::pow(2.0f, b), a * ::powf(2.0f, b)) template -__device__ inline DType rldexp(const DType a, const DType2 b) { +__device__ inline typename type_util::mixed_type::type +rldexp(const DType a, const DType2 b) { return ldexp(b, a); } diff --git a/src/common/cuda/rtc/type-inl.h b/src/common/cuda/rtc/util-inl.h similarity index 54% rename from src/common/cuda/rtc/type-inl.h rename to src/common/cuda/rtc/util-inl.h index 888a1f734587..ddd49841da20 100644 --- a/src/common/cuda/rtc/type-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -85,6 +85,62 @@ struct has_double_or_integral { has_double_or_integral::value; }; +template +struct enable_if {}; + +template <> +struct enable_if { + using type = void; +}; + +template 
+struct mixed_type; + +template +struct mixed_type::value>::type> { + using type = float64; +}; + +template +struct mixed_type { + using type = float64; +}; + +template +struct mixed_type::value && + !is_same::value>::type> { + using type = float32; +}; + +template +struct mixed_type::value>::type> { + using type = float32; +}; + +template +struct mixed_type::value || + is_integral::value>::type> { + using type = float16; +}; + +template +struct mixed_type::value>::type> { + using type = float16; +}; + +template +struct mixed_type::value && + is_integral::value && + sizeof(T) <= sizeof(U)>::type> { + using type = U; +}; + +template +struct mixed_type::value && + is_integral::value && + sizeof(T) < sizeof(U)>::type> { + using type = U; +}; } // namespace type_util )code" @@ -94,13 +150,47 @@ struct has_double_or_integral { "typedef int32 index_t;\n"; #endif -const char op_req_type_string[] = R"code( +const char util_string[] = R"code( enum class OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo }; + +namespace util { + +constexpr int MAX_DIM = 5; + +template +__device__ inline void unravel_dot(const index_t idx, const index_t (&shape)[MAX_DIM], + const index_t (&stridej)[MAX_DIM], const index_t (&stridek)[MAX_DIM], index_t* j, index_t* k) { + *j = 0; + *k = 0; + #pragma unroll + for (index_t i = ndim-1, idx_t = idx; i >=0; --i) { + const auto tmp = idx_t / shape[i]; + const auto coord = idx_t - tmp*shape[i]; + *j += coord*stridej[i]; + *k += coord*stridek[i]; + idx_t = tmp; + } +} + +template +__device__ inline index_t unravel_dot(const index_t idx, const index_t (&shape)[MAX_DIM], + const index_t (&stride)[MAX_DIM]) { + index_t ret = 0; + #pragma unroll + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; + ret += (j - tmp*shape[i])*stride[i]; + j = tmp; + } + return ret; +} + +} // namespace util )code"; } // namespace rtc } // namespace cuda diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index 682be9207a97..dbabc20cbb3b 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -316,25 +316,18 @@ constexpr int vectorized_kernel_thread_num = 512; template void VectorizedKernelRTCLauncher(const std::string &code, const std::string &kernel_name, - const int nvec, + int nvec, const index_t lead_dim, const index_t other_dim, mshadow::Stream *s, const Params params, const std::vector &inputs, const std::vector &outputs, - const int dev_id) { + const int dev_id, + const int lead_input_num = 0) { using namespace util; const index_t N = lead_dim * other_dim; if (N != 0) { - index_t num_aligned_elements = get_num_aligned_elements(params.inputs[0], lead_dim, nvec, - mshadow_type_info(inputs[0].type_flag_) - .size); - size_t num_elements = other_dim * num_aligned_elements; - constexpr int threads = vectorized_kernel_thread_num; - constexpr int max_blocks = 65535; - index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), - max_blocks); auto align = CheckAlignment(params, lead_dim, other_dim, nvec, inputs, outputs); std::stringstream kernel_builder; @@ -383,20 +376,27 @@ void VectorizedKernelRTCLauncher(const std::string &code, << std::endl; break; case Alignment::DIFFERENT: { - num_aligned_elements = lead_dim * other_dim; - blocks = std::min(static_cast((num_aligned_elements + threads - 1) / threads), - max_blocks); // If the pointers are aligned differently we cannot vectorize kernel_builder << "const bool aligned = true;" << std::endl << "const int nvec = 1;" 
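+                       // Pointers with mismatched misalignment cannot share vector
+                       // loads, so both the generated code ("const int nvec = 1;"
+                       // above) and the host-side launch configuration (nvec = 1
+                       // below) fall back to scalar accesses before
+                       // num_aligned_elements is computed.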
<< std::endl; + nvec = 1; break; } } kernel_builder << code; + index_t num_aligned_elements = get_num_aligned_elements( + params.inputs[lead_input_num], + lead_dim, nvec, + mshadow_type_info(inputs[lead_input_num].type_flag_).size); + size_t num_elements = other_dim * num_aligned_elements; + constexpr int threads = vectorized_kernel_thread_num; + constexpr int max_blocks = 65535; + index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), + max_blocks); std::vector args = {¶ms, &lead_dim, &other_dim, &N, &num_aligned_elements}; auto function = common::cuda::rtc::get_function(kernel_builder.str(), diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index f7d25c91a4d1..d4f5adb1804f 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -139,9 +139,17 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, const TBlob& output_grp = outputs[groupnorm::kOut].reshape(temp_data_shape); // Calculate data = data - mean - BinaryBroadcastCompute(attrs, ctx, - {data_grp, mean_grp}, - {kWriteTo}, {output_grp}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {data_grp, mean_grp}, + {kWriteTo}, {output_grp}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {data_grp, mean_grp}, + {kWriteTo}, {output_grp}); +#endif + } // Calculate std const TBlob centered_out = outputs[groupnorm::kOut].reshape(red_src_shape); @@ -156,9 +164,17 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, }); // Calculate data = data / std - BinaryBroadcastCompute(attrs, ctx, - {output_grp, std_grp}, - {kWriteTo}, {output_grp}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {output_grp, std_grp}, + {kWriteTo}, {output_grp}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {output_grp, std_grp}, + {kWriteTo}, {output_grp}); +#endif + } const TBlob& output = outputs[groupnorm::kOut]; mxnet::TShape new_param_shape(data_shape.ndim(), 1); @@ -167,14 +183,27 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, const TBlob& gamma = inputs[groupnorm::kGamma].reshape(new_param_shape); const TBlob& beta = inputs[groupnorm::kBeta].reshape(new_param_shape); - // Calculate data = data * gamma - BinaryBroadcastCompute(attrs, ctx, - {output, gamma}, - {kWriteTo}, {output}); - // Calculate data = data + beta - BinaryBroadcastCompute(attrs, ctx, - {output, beta}, - {kWriteTo}, {output}); + if constexpr (std::is_same::value) { + // Calculate data = data * gamma + BinaryBroadcastCompute(attrs, ctx, + {output, gamma}, + {kWriteTo}, {output}); + // Calculate data = data + beta + BinaryBroadcastCompute(attrs, ctx, + {output, beta}, + {kWriteTo}, {output}); + } else { +#if MXNET_USE_CUDA + // Calculate data = data * gamma + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {output, gamma}, + {kWriteTo}, {output}); + // Calculate data = data + beta + BinaryBroadcastRTCCompute {"add"}(attrs, ctx, + {output, beta}, + {kWriteTo}, {output}); +#endif + } } /* @@ -273,12 +302,23 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, const TBlob red_out = TBlob(workspace.dptr_ + reduce_workspace_size + data_size * 2, mean_.shape_, mean.dev_mask(), mean.type_flag_, mean.dev_id()); // Compute normalized_data = (data - mean) / std - BinaryBroadcastCompute(attrs, ctx, - {data_, mean_}, - {kWriteTo}, {normalized_data}); - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, std_}, - {kWriteTo}, {normalized_data}); + if constexpr (std::is_same::value) 
{ + BinaryBroadcastCompute(attrs, ctx, + {data_, mean_}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, std_}, + {kWriteTo}, {normalized_data}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {data_, mean_}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {normalized_data, std_}, + {kWriteTo}, {normalized_data}); +#endif + } // Calculate grad_beta if (req[2] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[2].type_flag_, DType, { @@ -294,8 +334,10 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, {kWriteTo}, {ograd_mult}); } else { +#if MXNET_USE_CUDA ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, {kWriteTo}, {ograd_mult}); +#endif } if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { @@ -313,12 +355,25 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, // + normalized_data * (-mean(normalized_data * ograd_mult, axis)) if (req[0] != kNullOp) { const TBlob output_ = outputs[0].reshape(data_.shape_); - BinaryBroadcastCompute(attrs, ctx, - {inputs[0], gamma}, - {kWriteTo}, {ograd_mult.reshape(data.shape_)}); - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, std_}, - {kWriteTo}, {ograd_mult}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {inputs[0], gamma}, + {kWriteTo}, + {ograd_mult.reshape(data.shape_)}); + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, std_}, + {kWriteTo}, {ograd_mult}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {inputs[0], gamma}, + {kWriteTo}, + {ograd_mult.reshape(data.shape_)}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {ograd_mult, std_}, + {kWriteTo}, {ograd_mult}); +#endif + } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { broadcast::Reduce( @@ -328,15 +383,20 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(N); }); - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, red_out}, - {req[0]}, {output_}); if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {output_}); ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {output_}); ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); +#endif } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { @@ -347,9 +407,17 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(-N); }); - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, red_out}, - {kAddTo}, {output_}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {output_}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {output_}); +#endif + } } } diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index 581319ea18b5..cde484a2290d 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -137,9 +137,17 @@ void 
LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, }); }); // Calculate data = data - mean - BinaryBroadcastCompute(attrs, ctx, - {inputs[0], outputs[layernorm::kMean]}, - {kWriteTo}, {outputs[0]}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {inputs[0], outputs[layernorm::kMean]}, + {kWriteTo}, {outputs[0]}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {inputs[0], outputs[layernorm::kMean]}, + {kWriteTo}, {outputs[0]}); +#endif + } // Calculate std const TBlob centered_out = outputs[0].reshape(red_src_shape); MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -156,18 +164,35 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, + scalar(param.eps)); }); }); - // Calculate data = data / std - BinaryBroadcastCompute(attrs, ctx, - {outputs[0], outputs[layernorm::kStd]}, - {kWriteTo}, {outputs[0]}); - // Calculate data = data * gamma - BinaryBroadcastCompute(attrs, ctx, - {outputs[0], gamma}, - {kWriteTo}, {outputs[0]}); - // Calculate data = data + beta - BinaryBroadcastCompute(attrs, ctx, - {outputs[0], beta}, - {kWriteTo}, {outputs[0]}); + if constexpr (std::is_same::value) { + // Calculate data = data / std + BinaryBroadcastCompute(attrs, ctx, + {outputs[0], outputs[layernorm::kStd]}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data * gamma + BinaryBroadcastCompute(attrs, ctx, + {outputs[0], gamma}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data + beta + BinaryBroadcastCompute(attrs, ctx, + {outputs[0], beta}, + {kWriteTo}, {outputs[0]}); + } else { +#if MXNET_USE_CUDA + // Calculate data = data / std + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {outputs[0], outputs[layernorm::kStd]}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data * gamma + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {outputs[0], gamma}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data + beta + BinaryBroadcastRTCCompute {"add"}(attrs, ctx, + {outputs[0], beta}, + {kWriteTo}, {outputs[0]}); +#endif + } } template @@ -252,12 +277,23 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, const TBlob red_out = TBlob(workspace.dptr_ + reduce_workspace_size + data_size * 2, mean.shape_, mean.dev_mask(), mean.type_flag_, mean.dev_id()); // Compute normalized_data = (data - mean) / std - BinaryBroadcastCompute(attrs, ctx, - {data, mean}, + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {data, mean}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, std}, {kWriteTo}, {normalized_data}); - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, std}, - {kWriteTo}, {normalized_data}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {data, mean}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {normalized_data, std}, + {kWriteTo}, {normalized_data}); +#endif + } // Calculate grad_beta bool safe_acc = dmlc::GetEnv("MXNET_SAFE_ACCUMULATION", false); if (req[2] != kNullOp) { @@ -303,12 +339,23 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, // grad_data = ograd_mult - mean(ograd_mult, axis) // + normalized_data * (-mean(normalized_data * ograd_mult, axis)) if (req[0] != kNullOp) { - BinaryBroadcastCompute(attrs, ctx, - {ograd, gamma}, - {kWriteTo}, {ograd_mult}); - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, std}, - {kWriteTo}, {ograd_mult}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {ograd, gamma}, + 
{kWriteTo}, {ograd_mult}); + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, std}, + {kWriteTo}, {ograd_mult}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {ograd, gamma}, + {kWriteTo}, {ograd_mult}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {ograd_mult, std}, + {kWriteTo}, {ograd_mult}); +#endif + } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { if (safe_acc) { @@ -324,15 +371,18 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(channel_size); }); - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, red_out}, - {req[0]}, {outputs[0]}); if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {outputs[0]}); ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); } else { ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {outputs[0]}); } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { @@ -349,9 +399,17 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(- channel_size); }); - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, red_out}, - {kAddTo}, {outputs[0]}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {outputs[0]}); + } else { +#if MXNET_USE_CUDA + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {outputs[0]}); +#endif + } } } diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.cu b/src/operator/numpy/np_elemwise_broadcast_logic_op.cu index 42acbea563e6..90c5bb465a73 100644 --- a/src/operator/numpy/np_elemwise_broadcast_logic_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.cu @@ -34,7 +34,7 @@ namespace op { #define MXNET_OPERATOR_REGISTER_NP_BINARY_LOGIC_GPU(name) \ NNVM_REGISTER_OP(_npi_##name) \ - .set_attr("FCompute", BinaryBroadcastComputeLogic) + .set_attr("FCompute", BinaryBroadcastRTCCompute{"np_" #name}) #define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_GPU(name) \ NNVM_REGISTER_OP(_npi_##name##_scalar) \ diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index 82958924d933..2fe56d2787b4 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -29,50 +29,35 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_add) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"add"}); NNVM_REGISTER_OP(_backward_npi_broadcast_add) .set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_subtract) -.set_attr( - "FCompute", - NumpyBinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"sub"}); NNVM_REGISTER_OP(_backward_npi_broadcast_sub) .set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_multiply) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mul) .set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_mod) -.set_attr( - 
"FCompute", - NumpyBinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mod) .set_attr("FCompute", NumpyBinaryBackwardUseIn); NNVM_REGISTER_OP(_npi_power) -.set_attr( - "FCompute", - NumpyBinaryBroadcastComputeWithBool); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_npi_broadcast_power) .set_attr("FCompute", NumpyBinaryBackwardUseIn("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"copysign"}); NNVM_REGISTER_OP(_npi_lcm) -.set_attr("FCompute", BinaryBroadcastIntCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"lcm"}); NNVM_REGISTER_OP(_npi_bitwise_and) .set_attr("FCompute", BinaryBroadcastIntCompute); NNVM_REGISTER_OP(_npi_bitwise_xor) -.set_attr("FCompute", BinaryBroadcastIntCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"bitwise_xor"}); NNVM_REGISTER_OP(_npi_bitwise_or) .set_attr("FCompute", BinaryBroadcastIntCompute); @@ -48,13 +48,13 @@ NNVM_REGISTER_OP(_backward_npi_copysign) mshadow_op::copysign_rgrad>); NNVM_REGISTER_OP(_npi_arctan2) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"arctan2"}); NNVM_REGISTER_OP(_backward_npi_arctan2) .set_attr("FCompute", BinaryBroadcastBackwardUseIn); NNVM_REGISTER_OP(_npi_hypot) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_npi_hypot) .set_attr("FCompute", BinaryBroadcastBackwardUseIn("FCompute", BinaryScalarOp::ComputeInt); NNVM_REGISTER_OP(_npi_ldexp) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"ldexp"}); NNVM_REGISTER_OP(_npi_ldexp_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"ldexp"}); diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 790f1afb857c..15fc1d879d7d 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -28,245 +28,6 @@ using namespace mshadow::cuda; -template -struct VectorizedBinaryBroadcastParam { - const DType* inputs[2]; - DType* outputs[1]; - Shape stride[2]; - Shape oshape; - index_t size[2]; -}; - -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -__global__ void VectorizedBinaryBroadcastKernel( - const VectorizedBinaryBroadcastParam param, - const index_t lead_dim, const index_t other_dim, - const index_t num_aligned_elements) { - constexpr int nvec = sizeof(LType) / sizeof(DType); - const index_t M = num_aligned_elements * other_dim; - const index_t N = lead_dim * other_dim; - - VectorizedLoader lloader(param.inputs[0], param.size[0]); - VectorizedLoader rloader(param.inputs[1], param.size[1]); - - for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < M; - idx += gridDim.x * blockDim.x) { - DType * current_output_pointer; - index_t output_size; - index_t output_idx; - if (aligned) { - // Simplified case - index_t lindex, rindex; - unravel_dot(idx * nvec, param.oshape, - param.stride[0], param.stride[1], - &lindex, &rindex); - lloader.load(lindex / nvec, param.size[0]); - rloader.load(rindex / nvec, param.size[1]); - current_output_pointer = param.outputs[0]; - output_size = N; - output_idx = idx; - } else { - const index_t row = idx / num_aligned_elements; - const index_t lead_dim_idx = idx - row * num_aligned_elements; - - index_t lindex, rindex; - const index_t original_idx = 
max(lead_dim_idx * nvec - lloader.alignment(), - static_cast(0)) + - row * lead_dim; - unravel_dot(original_idx, param.oshape, - param.stride[0], param.stride[1], - &lindex, &rindex); - lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); - rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); - current_output_pointer = param.outputs[0] + row * lead_dim; - output_size = lead_dim; - output_idx = lead_dim_idx; - } - VectorizedStorer storer(current_output_pointer, output_size); - - if (req == kAddTo) { - storer.load(output_idx, output_size); - } -#pragma unroll - for (int i = 0; i < lloader.nvec(); ++i) { - DType temp = OP::Map(lloader.separate()[i], - rloader.separate()[i]); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(output_idx, output_size); - } -} - -template -__global__ void VectorizedBinaryBroadcastSingleSideKernel( - const VectorizedBinaryBroadcastParam param, - const index_t lead_dim, const index_t other_dim, - const index_t num_aligned_elements) { - constexpr int nvec = sizeof(LType) / sizeof(DType); - const index_t M = num_aligned_elements * other_dim; - const index_t N = lead_dim * other_dim; - constexpr int other_side = 1 - side; - - VectorizedLoader lloader(param.inputs[side], param.size[side]); - - for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < M; - idx += gridDim.x * blockDim.x) { - index_t original_idx; - DType * current_output_pointer; - index_t output_size; - index_t output_idx; - if (aligned) { - //Simplified case - original_idx = idx * nvec; - const index_t lindex = mxnet_op::unravel_dot(original_idx, param.oshape, - param.stride[side]); - lloader.load(lindex / nvec, param.size[side]); - current_output_pointer = param.outputs[0]; - output_size = N; - output_idx = idx; - } else { - const index_t row = idx / num_aligned_elements; - const index_t lead_dim_idx = idx - row * num_aligned_elements; - original_idx = lead_dim_idx * nvec - - lloader.alignment() + row * lead_dim; - const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), - static_cast(0)) + - row * lead_dim; - const index_t lindex = mxnet_op::unravel_dot(original_idx_clamped, param.oshape, - param.stride[side]); - lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); - current_output_pointer = param.outputs[0] + row * lead_dim; - output_size = lead_dim; - output_idx = lead_dim_idx; - } - VectorizedStorer storer(current_output_pointer, output_size); - - if (req == kAddTo) { - storer.load(output_idx, output_size); - } -#pragma unroll - for (int i = 0; i < lloader.nvec(); ++i) { - const index_t rindex = min(max(mxnet_op::unravel_dot(original_idx + i, - param.oshape, - param.stride[other_side]), - static_cast(0)), - param.size[other_side] - 1); - DType rinput = param.inputs[other_side][rindex]; - DType temp; - if (side == 0) { - // Left side is vectorized - temp = OP::Map(lloader.separate()[i], - rinput); - } else { - // Right side is vectorized - temp = OP::Map(rinput, - lloader.separate()[i]); - } - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(output_idx, output_size); - } -} - -template -class VectorizedBinaryBroadcastFwd { - public: - using ParamType = VectorizedBinaryBroadcastParam; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t other_dim) { - int 
common_shape = 1; - int first_different = -1; - for (int i = ndim - 1; i >= 0; --i) { - if (params.stride[0][i] == params.stride[1][i]) { - common_shape *= params.oshape[i]; - } else { - first_different = i; - break; - } - } - - if (common_shape != 1) { - VectorizedLoader loader(params.inputs[0], lead_dim); - const index_t num_elements_per_row = loader.num_aligned_elements(); - VectorizedBinaryBroadcastKernel - <<>>(params, lead_dim, other_dim, num_elements_per_row); - } else { - if (params.stride[0][first_different] == 0) { - VectorizedLoader loader(params.inputs[1], lead_dim); - const index_t num_elements_per_row = loader.num_aligned_elements(); - VectorizedBinaryBroadcastSingleSideKernel - <<>>(params, lead_dim, other_dim, num_elements_per_row); - } else { - VectorizedLoader loader(params.inputs[0], lead_dim); - const index_t num_elements_per_row = loader.num_aligned_elements(); - VectorizedBinaryBroadcastSingleSideKernel - <<>>(params, lead_dim, other_dim, num_elements_per_row); - } - } - } -}; - -template -void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, - const TBlob& lhs, const TBlob& rhs, const TBlob& out) { - using common::cuda::VectorizedKernelLauncher; - if (req == kNullOp) return; - cudaStream_t stream = Stream::GetStream(s); - const index_t N = out.shape_.Size(); - - Shape lstride = mxnet_op::calc_stride(lhs.shape_.get()); - Shape rstride = mxnet_op::calc_stride(rhs.shape_.get()); - - MXNET_ASSIGN_REQ_SWITCH(req, Req, { - using LType = uint2; - using Kernel = VectorizedBinaryBroadcastFwd; - - typename Kernel::ParamType param; - - param.inputs[0] = lhs.dptr(); - param.inputs[1] = rhs.dptr(); - param.outputs[0] = out.dptr(); - param.stride[0] = lstride; - param.stride[1] = rstride; - param.oshape = out.shape_.get(); - param.size[0] = lhs.shape_.Size(); - param.size[1] = rhs.shape_.Size(); - - index_t lead_dim = 1; - for (int i = ndim - 1; i >= 0; --i) { - /* Find the first non-1 dimension - to check the alignment - */ - if (param.oshape[i] != 1) { - lead_dim = param.oshape[i]; - break; - } - } - const index_t other_dim = out.shape_.Size() / lead_dim; - - VectorizedKernelLauncher(lead_dim, other_dim, s, param); - }); -} - const int nthread_reduce = kMaxThreadsPerBlock; template __launch_bounds__(nthread_reduce) diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index d511791a82da..dd1505d9357c 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -289,8 +289,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const #ifdef __CUDACC__ #include "broadcast_reduce-inl.cuh" - -#else +#endif template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, @@ -479,7 +478,6 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, lhs.shape_.get(), rhs.shape_.get()); } -#endif } // namespace broadcast } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc new file mode 100644 index 000000000000..26c277dfe071 --- /dev/null +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -0,0 +1,348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "broadcast_reduce-inl.h" +#include "elemwise_binary_broadcast_op.h" + +#if MXNET_USE_CUDA +#include "../../common/cuda/rtc/vectorization-inl.h" +#include "../../common/cuda/rtc.h" +#endif // MXNET_USE_CUDA + +namespace mxnet { +namespace op { + +#if MXNET_USE_CUDA + +struct binary_broadcast_params { + const void* inputs[2]; + void* outputs[1]; + index_t stride[2][broadcast::MAX_DIM]; + index_t oshape[broadcast::MAX_DIM]; + index_t size[2]; +}; + +const char broadcast_kernel_fwd[] = R"code( +struct binary_broadcast_params { + const void* inputs[2]; + void* outputs[1]; + index_t stride[2][util::MAX_DIM]; + index_t oshape[util::MAX_DIM]; + index_t size[2]; +}; + +__global__ void binary_broadcast_kernel( + const binary_broadcast_params param, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + const index_t M = num_aligned_elements * other_dim; + + VectorizedLoader lloader( + reinterpret_cast(param.inputs[0]), param.size[0]); + VectorizedLoader rloader( + reinterpret_cast(param.inputs[1]), param.size[1]); + + using IType0 = AccType; + using IType1 = AccType; + using OType = AccType; + + + for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < M; + idx += gridDim.x * blockDim.x) { + OutputType0 * current_output_pointer; + index_t output_size; + index_t output_idx; + if (aligned) { + // Simplified case + index_t lindex, rindex; + util::unravel_dot(idx * nvec, param.oshape, + param.stride[0], param.stride[1], + &lindex, &rindex); + lloader.load(lindex / nvec, param.size[0]); + rloader.load(rindex / nvec, param.size[1]); + current_output_pointer = reinterpret_cast(param.outputs[0]); + output_size = N; + output_idx = idx; + } else { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + + index_t lindex, rindex; + const index_t original_idx = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; + util::unravel_dot(original_idx, param.oshape, + param.stride[0], param.stride[1], + &lindex, &rindex); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[0]); + rloader.load((rindex + lloader.alignment()) / nvec, param.size[1]); + current_output_pointer = reinterpret_cast(param.outputs[0]) + row * lead_dim; + output_size = lead_dim; + output_idx = lead_dim_idx; + } + VectorizedStorer storer(current_output_pointer, output_size); + + if (req == OpReqType::kAddTo) { + storer.load(output_idx, output_size); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto temp = OP(lloader.separate()[i], + rloader.separate()[i]); + + if (req == OpReqType::kAddTo) { + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(output_idx, output_size); + } +} +)code"; + 
+const char single_side_broadcast_kernel_fwd[] = R"code( +struct binary_broadcast_params { + const void* inputs[2]; + void* outputs[1]; + index_t stride[2][util::MAX_DIM]; + index_t oshape[util::MAX_DIM]; + index_t size[2]; +}; + +__global__ void single_side_binary_broadcast_kernel( + const binary_broadcast_params param, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + const index_t M = num_aligned_elements * other_dim; + constexpr int other_side = 1 - side; + + VectorizedLoader lloader( + reinterpret_cast(param.inputs[side]), param.size[side]); + + using IType = AccType; + using IType2 = AccType; + using OType = AccType; + + + for (index_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < M; + idx += gridDim.x * blockDim.x) { + index_t original_idx; + OutputType0 * current_output_pointer; + index_t output_size; + index_t output_idx; + if (aligned) { + // Simplified case + original_idx = idx * nvec; + const index_t lindex = util::unravel_dot(original_idx, param.oshape, + param.stride[side]); + lloader.load(lindex / nvec, param.size[side]); + current_output_pointer = reinterpret_cast(param.outputs[0]); + output_size = N; + output_idx = idx; + } else { + const index_t row = idx / num_aligned_elements; + const index_t lead_dim_idx = idx - row * num_aligned_elements; + original_idx = lead_dim_idx * nvec - + lloader.alignment() + row * lead_dim; + const index_t original_idx_clamped = max(lead_dim_idx * nvec - lloader.alignment(), + static_cast(0)) + + row * lead_dim; + const index_t lindex = util::unravel_dot(original_idx_clamped, param.oshape, + param.stride[side]); + lloader.load((lindex + lloader.alignment()) / nvec, param.size[side]); + current_output_pointer = reinterpret_cast(param.outputs[0]) + row * lead_dim; + output_size = lead_dim; + output_idx = lead_dim_idx; + } + VectorizedStorer storer(current_output_pointer, output_size); + + if (req == OpReqType::kAddTo) { + storer.load(output_idx, output_size); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const index_t rindex = min(max(util::unravel_dot(original_idx + i, + param.oshape, + param.stride[other_side]), + static_cast(0)), + param.size[other_side] - 1); + const auto rinput = IType2::from( + reinterpret_cast(param.inputs[other_side]) + [rindex]); + + typename OType::type temp; + if (side == 0) { + // Left side is vectorized + temp = OP(lloader.separate()[i], + rinput); + } else { + // Right side is vectorized + temp = OP(rinput, + lloader.separate()[i]); + } + + if (req == OpReqType::kAddTo) { + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(output_idx, output_size); + } +} +)code"; +namespace { + +std::vector calc_stride(const mxnet::TShape& shape, int ndim) { + CHECK_EQ(ndim, shape.ndim()); + std::vector stride(ndim); + index_t cumprod = 1; + for (int i = shape.ndim() - 1; i >= 0; --i) { + stride[i] = (shape[i] > 1) ? 
cumprod : 0; + cumprod *= shape[i]; + } + return stride; +} + +} // namespace + +void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (outputs[0].shape_.Size() == 0U) return; + if (req[0] == kNullOp) return; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + mxnet::TShape new_lshape, new_rshape, new_oshape; + int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, + &new_lshape, &new_rshape, &new_oshape); + // Pad the ndim + BROADCAST_NDIM_SWITCH(ndim, NDim, { + if (ndim != 0) { + ndim = NDim; + } + }); + + if (!ndim) { + ElemwiseBinaryRTCCompute {OP}(attrs, ctx, inputs, req, outputs); + } else { + mshadow::Stream *s = ctx.get_stream(); + const TBlob& lhs = inputs[0].reshape(new_lshape); + const TBlob& rhs = inputs[1].reshape(new_rshape); + const TBlob& output = outputs[0].reshape(new_oshape); + + const auto& lstride = calc_stride(lhs.shape_, ndim); + const auto& rstride = calc_stride(rhs.shape_, ndim); + + size_t output_type_size = util::mshadow_type_info(outputs[0].type_flag_).size; + const int nvec = output_type_size <= sizeof(uint64_t) + ? (sizeof(uint64_t) / output_type_size) + : 1; + binary_broadcast_params params{}; + params.inputs[0] = lhs.dptr_; + params.inputs[1] = rhs.dptr_; + params.outputs[0] = output.dptr_; + for (int i = 0; i < ndim; ++i) { + params.stride[0][i] = lstride[i]; + params.stride[1][i] = rstride[i]; + params.oshape[i] = new_oshape[i]; + } + params.size[0] = lhs.shape_.Size(); + params.size[1] = rhs.shape_.Size(); + + index_t lead_dim = 1; + for (int i = ndim - 1; i >= 0; --i) { + /* Find the first non-1 dimension + to check the alignment + */ + if (params.oshape[i] != 1) { + lead_dim = params.oshape[i]; + break; + } + } + const index_t other_dim = output.shape_.Size() / lead_dim; + + int first_different = -1; + int common_shape = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (params.stride[0][i] == params.stride[1][i]) { + common_shape *= params.oshape[i]; + } else { + first_different = i; + break; + } + } + + int lead_input_num = 0; + std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + "#define OP op::" + + OP + + "\n" + "const int ndim = " + + std::to_string(ndim) + + ";\n"; + std::string kernel_name; + if (common_shape != 1) { + code += broadcast_kernel_fwd; + kernel_name = "binary_broadcast_kernel"; + } else { + if (params.stride[0][first_different] == 0) { + lead_input_num = 1; + code += "const int side = 1;\n" + "using DType = InputType1;\n" + "using DType2 = InputType0;\n"; + } else { + code += "const int side = 0;\n" + "using DType = InputType0;\n" + "using DType2 = InputType1;\n"; + } + code += single_side_broadcast_kernel_fwd; + kernel_name = "single_side_binary_broadcast_kernel"; + } + + VectorizedKernelRTCLauncher(code, kernel_name, nvec, + lead_dim, other_dim, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id, + lead_input_num); + } +} + +#endif // MXNET_USE_CUDA + +} // namespace op +} // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index a47ff4cedff6..b2be0dfadd6d 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -247,6 +247,20 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, } } +#if 
MXNET_USE_CUDA + +struct BinaryBroadcastRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +#endif // MXNET_USE_CUDA + template void BinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu index aa0850ac5bbf..e9a75c509fd4 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu @@ -29,7 +29,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(broadcast_add) -.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FCompute", BinaryBroadcastRTCCompute{"add"}) .set_attr("FComputeEx", BinaryBroadcastComputeDenseEx); NNVM_REGISTER_OP(_backward_broadcast_add) @@ -37,7 +37,7 @@ NNVM_REGISTER_OP(_backward_broadcast_add) mshadow_op::identity>); NNVM_REGISTER_OP(broadcast_sub) -.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FCompute", BinaryBroadcastRTCCompute{"sub"}) .set_attr("FComputeEx", BinaryBroadcastComputeDenseEx); NNVM_REGISTER_OP(_backward_broadcast_sub) @@ -45,7 +45,7 @@ NNVM_REGISTER_OP(_backward_broadcast_sub) mshadow_op::negation>); NNVM_REGISTER_OP(broadcast_mul) -.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}) .set_attr("FComputeEx", BinaryBroadcastComputeSparseEx); NNVM_REGISTER_OP(_backward_broadcast_mul) @@ -53,7 +53,7 @@ NNVM_REGISTER_OP(_backward_broadcast_mul) mshadow_op::left>); NNVM_REGISTER_OP(broadcast_div) -.set_attr("FCompute", BinaryBroadcastCompute) +.set_attr("FCompute", BinaryBroadcastRTCCompute{"div"}) .set_attr("FComputeEx", BinaryBroadcastComputeSparseEx); NNVM_REGISTER_OP(_backward_broadcast_div) @@ -61,7 +61,7 @@ NNVM_REGISTER_OP(_backward_broadcast_div) mshadow_op::div_rgrad>); NNVM_REGISTER_OP(broadcast_mod) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_broadcast_mod) .set_attr("FCompute", BinaryBroadcastBackwardUseIn("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_broadcast_power) .set_attr("FCompute", BinaryBroadcastBackwardUseIn); NNVM_REGISTER_OP(broadcast_maximum) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"max"}); NNVM_REGISTER_OP(_backward_broadcast_maximum) .set_attr("FCompute", BinaryBroadcastBackwardUseIn); NNVM_REGISTER_OP(broadcast_minimum) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_broadcast_minimum) .set_attr("FCompute", BinaryBroadcastBackwardUseIn); NNVM_REGISTER_OP(broadcast_hypot) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_broadcast_hypot) .set_attr("FCompute", BinaryBroadcastBackwardUseIn("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"equal"}); NNVM_REGISTER_OP(broadcast_not_equal) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"not_equal"}); NNVM_REGISTER_OP(broadcast_greater) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"greater"}); 
NNVM_REGISTER_OP(broadcast_greater_equal) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"greater_equal"}); NNVM_REGISTER_OP(broadcast_lesser) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"less"}); NNVM_REGISTER_OP(broadcast_lesser_equal) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"less_equal"}); NNVM_REGISTER_OP(broadcast_logical_and) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"logical_and"}); NNVM_REGISTER_OP(broadcast_logical_or) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"logical_or"}); NNVM_REGISTER_OP(broadcast_logical_xor) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"logical_xor"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index c0cf231596a1..cb0da7554bc3 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -259,7 +259,7 @@ NNVM_REGISTER_OP(_mod) NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - ElemwiseBinaryRTCBwdUseIn{"mod_grad","mod_rgrad"}); + ElemwiseBinaryRTCBwdUseIn{"mod_grad", "mod_rgrad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op_extended.cu b/src/operator/tensor/elemwise_binary_op_extended.cu index 8f7087b9ac08..9d568a404f3d 100644 --- a/src/operator/tensor/elemwise_binary_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_op_extended.cu @@ -30,7 +30,7 @@ NNVM_REGISTER_OP(_power) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_power) -.set_attr("FCompute", ElemwiseBinaryRTCBwdUseIn{"power_grad","power_rgrad"}); +.set_attr("FCompute", ElemwiseBinaryRTCBwdUseIn{"power_grad", "power_rgrad"}); NNVM_REGISTER_OP(_maximum) .set_attr("FCompute", ElemwiseBinaryRTCCompute{"max"}); From 3ea932f9e77b6d1ba1352800948e22cea0134347 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 11 Jun 2020 16:09:17 -0700 Subject: [PATCH 12/64] RTC for elementwise sum Fixes --- src/common/cuda/rtc.cc | 4 + src/common/cuda/rtc/forward_functions-inl.h | 14 +- src/common/cuda/rtc/vectorization-inl.h | 1 + src/operator/nn/layer_norm-inl.h | 4 +- .../tensor/elemwise_binary_broadcast_op.cc | 10 +- src/operator/tensor/elemwise_binary_op.cc | 2 +- src/operator/tensor/elemwise_sum.cu | 151 ++++++++++-------- 7 files changed, 108 insertions(+), 78 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 9ec33074d93c..29be7fa6199d 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -202,6 +202,10 @@ CUfunction get_function(const std::string &code, CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); // Jit-compile ptx for the driver's current context CUmodule module; + std::ofstream f("debug.ptx"); + f << kinfo.ptx; + f.close(); + CUDA_DRIVER_CALL(cuModuleLoadData(&module, kinfo.ptx.c_str())); CUDA_DRIVER_CALL(cuModuleGetFunction(&kinfo.functions[dev_id], module, diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 2ac955ae4622..83b915b5b0e5 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -317,7 +317,7 @@ name (const DType a, const DType2 b) { \ if 
(type_util::has_double_or_integral::value) { \ return double_version ((double)a, (double)b); \ } else { \ - return float_version (a, b); \ + return float_version ((float)a, (float)b); \ } \ } @@ -506,7 +506,7 @@ lcm(const DType a, const DType2 b) { template __device__ inline typename type_util::mixed_type::type bitwise_xor(const DType a, const DType2 b) { - return static_cast(a) ^ static_cast(b); + return static_cast(a) ^ static_cast(b); } @@ -518,7 +518,15 @@ rarctan2(const DType a, const DType2 b) { return arctan2(b, a); } -DEFINE_BINARY_MATH_FUNC(ldexp, a * ::pow(2.0f, b), a * ::powf(2.0f, b)) +template +__device__ inline typename type_util::mixed_type::type +ldexp(const DType a, const DType2 b) { + if (type_util::has_double_or_integral::value) { + return a * ::pow(2.0, static_cast(b)); + } else { + return a * ::powf(2.0f, static_cast(b)); + } +} template __device__ inline typename type_util::mixed_type::type diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index dbabc20cbb3b..debe5b417015 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -327,6 +327,7 @@ void VectorizedKernelRTCLauncher(const std::string &code, const int lead_input_num = 0) { using namespace util; const index_t N = lead_dim * other_dim; + nvec = std::min(nvec, 4); // Use at most 4-wide vectors if (N != 0) { auto align = CheckAlignment(params, lead_dim, other_dim, nvec, inputs, outputs); diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index cde484a2290d..a9b356cf4f96 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -378,11 +378,11 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); } else { - ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, {ograd_mult, red_out}, {req[0]}, {outputs[0]}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index 26c277dfe071..7f8b7a55ffe7 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -109,8 +109,8 @@ __global__ void binary_broadcast_kernel( } #pragma unroll for (int i = 0; i < nvec; ++i) { - const auto temp = OP(lloader.separate()[i], - rloader.separate()[i]); + const auto temp = OP(IType0::from(lloader.separate()[i]), + IType1::from(rloader.separate()[i])); if (req == OpReqType::kAddTo) { const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); @@ -195,18 +195,18 @@ __global__ void single_side_binary_broadcast_kernel( static_cast(0)), param.size[other_side] - 1); const auto rinput = IType2::from( - reinterpret_cast(param.inputs[other_side]) + reinterpret_cast(param.inputs[other_side]) [rindex]); typename OType::type temp; if (side == 0) { // Left side is vectorized - temp = OP(lloader.separate()[i], + temp = OP(IType::from(lloader.separate()[i]), rinput); } else { // Right side is vectorized temp = OP(rinput, - lloader.separate()[i]); + IType::from(lloader.separate()[i])); } if (req == OpReqType::kAddTo) { diff --git 
a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index 50df0e551131..d52886b8842c 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -124,7 +124,7 @@ __global__ void binary_kernel(const binary_kernel_params params, reinterpret_cast(params.outputs[0]), N); using IType0 = AccType; - using IType1 = AccType; + using IType1 = AccType; using OType = AccType; const index_t M = num_aligned_elements; diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index bc58834e5fc0..bf212ef8242e 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -24,116 +24,133 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" -#include "../../common/cuda/vectorization.cuh" +#include "../../common/cuda/rtc.h" +#include "../../common/cuda/rtc/vectorization-inl.h" namespace mxnet { namespace op { -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - namespace { constexpr size_t num_inputs_per_kernel = 4; -template -struct VectorizedElementwiseSumKernelParams { +struct elementwise_sum_params { + int num_inputs; + const void* inputs[num_inputs_per_kernel]; + void* outputs[1]; +}; + +const char elementwise_sum_kernel[] = R"code( +constexpr size_t num_inputs_per_kernel = 4; + +struct elementwise_sum_params { int num_inputs; - const DType* inputs[NumInputs]; - DType* outputs[1]; + const void* inputs[num_inputs_per_kernel]; + void* outputs[1]; }; -template -__launch_bounds__(mxnet::common::cuda::vectorized_kernel_thread_num) -__global__ void VectorizedElementwiseSumKernel( - const VectorizedElementwiseSumKernelParams params, - const index_t N) { - VectorizedStorer storer(params.outputs[0], N); +__global__ void elementwise_sum_kernel( + const elementwise_sum_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); - const index_t M = storer.num_aligned_elements(); + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < M; tid += gridDim.x * blockDim.x) { - if (req == kAddTo) { + typename OType::type temp[nvec]; + if (req == OpReqType::kAddTo) { storer.load(tid, N); +#pragma unroll + for (int i = 0; i < nvec; ++i) { + temp[i] = OType::from(storer.separate()[i]); + } } else { #pragma unroll - for (int i = 0; i < storer.nvec(); ++i) { - storer.separate()[i] = 0; + for (int i = 0; i < nvec; ++i) { + temp[i] = 0; } } #pragma unroll for (int i = 0; i < num_inputs_per_kernel; ++i) { if (i < params.num_inputs) { - VectorizedLoader loader(params.inputs[i], N); + VectorizedLoader loader( + reinterpret_cast(params.inputs[i]), N); loader.load(tid, N); #pragma unroll - for (int i = 0; i < loader.nvec(); ++i) { - storer.separate()[i] += loader.separate()[i]; + for (int i = 0; i < nvec; ++i) { + temp[i] += IType::from(loader.separate()[i]); } } } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + storer.separate()[i] = OType::to(temp[i]); + } storer.store(tid, N); } } - - -template -class VectorizedElementwiseSumFwd { - public: - using ParamType = VectorizedElementwiseSumKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const 
index_t lead_dim, - const index_t /* other_dim */) { - VectorizedElementwiseSumKernel - <<>>(params, lead_dim); - } -}; +)code"; void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { + using namespace mxnet::common::cuda::rtc; mshadow::Stream *s = ctx.get_stream(); if (req[0] == kNullOp) return; CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint2; - const index_t size = inputs[0].Size(); - for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) { - if (i == 0) { - using Kernel = VectorizedElementwiseSumFwd; - typename Kernel::ParamType params; - params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); - for (int j = 0; j < params.num_inputs; ++j) { - params.inputs[j] = inputs[i + j].dptr(); - } - params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, 1, s, params); - } else { - /* During subsequent launches we need to - accumulate into the previous outputs - */ - using Kernel = VectorizedElementwiseSumFwd; - typename Kernel::ParamType params; - params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); - for (int j = 0; j < params.num_inputs; ++j) { - params.inputs[j] = inputs[i + j].dptr(); - } - params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, 1, s, params); - } + size_t output_type_size = util::mshadow_type_info(outputs[0].type_flag_).size; + const int nvec = output_type_size <= sizeof(uint2) + ? (sizeof(uint2) / output_type_size) + : 1; + const index_t size = inputs[0].Size(); + for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) { + if (i == 0) { + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + + elementwise_sum_kernel; + elementwise_sum_params params{}; + params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); + for (int j = 0; j < params.num_inputs; ++j) { + params.inputs[j] = inputs[i + j].dptr_; + } + params.outputs[0] = outputs[0].dptr_; + VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel", nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); + } else { + /* During subsequent launches we need to + accumulate into the previous outputs + */ + const std::string code = std::string("const OpReqType req = OpReqType::kAddTo;\n") + + elementwise_sum_kernel; + elementwise_sum_params params{}; + params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); + for (int j = 0; j < params.num_inputs; ++j) { + params.inputs[j] = inputs[i + j].dptr_; } - }); - }); + params.outputs[0] = outputs[0].dptr_; + const std::vector new_inputs(inputs.begin() + i, inputs.end()); + VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel", nvec, + size, 1, s, params, + new_inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); + } + } } void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, From 9da292c097427e050bd81daa4f21d690b3b90867 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 15 Jun 2020 13:40:16 -0700 Subject: [PATCH 13/64] RTC for backward usenone of broadcast --- src/common/cuda/rtc.cc | 4 +- src/common/cuda/rtc/reducer-inl.h | 103 ++++ src/common/cuda/rtc/util-inl.h | 18 + src/operator/tensor/broadcast_reduce-inl.cuh | 32 +- src/operator/tensor/broadcast_reduce-inl.h | 31 ++ .../tensor/elemwise_binary_broadcast_op.cc | 60 +++ .../tensor/elemwise_binary_broadcast_op.h | 30 +- 
.../elemwise_binary_broadcast_op_basic.cu | 6 +- src/operator/tensor/reduce_rtc.cc | 500 ++++++++++++++++++ 9 files changed, 748 insertions(+), 36 deletions(-) create mode 100644 src/common/cuda/rtc/reducer-inl.h create mode 100644 src/operator/tensor/reduce_rtc.cc diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 29be7fa6199d..46b26229a0ce 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -36,6 +36,7 @@ #include "rtc/backward_functions-inl.h" #include "rtc/vectorization-inl.h" #include "rtc/special_functions-inl.h" +#include "rtc/reducer-inl.h" #include "utils.h" @@ -144,7 +145,8 @@ CUfunction get_function(const std::string &code, special_functions_definitions + '\n' + function_definitions + "\n" + backward_function_definitions + "\n" + - vectorization_support_string + "\n"; + vectorization_support_string + "\n" + + reducer + "\n"; std::string code_with_header = common_header + code; // If verbose mode, output kernel source, though not including the common header if (dmlc::GetEnv("MXNET_RTC_VERBOSE", false)) { diff --git a/src/common/cuda/rtc/reducer-inl.h b/src/common/cuda/rtc/reducer-inl.h new file mode 100644 index 000000000000..b57a2fcbaee2 --- /dev/null +++ b/src/common/cuda/rtc/reducer-inl.h @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef MXNET_COMMON_CUDA_RTC_REDUCER_INL_H_ +#define MXNET_COMMON_CUDA_RTC_REDUCER_INL_H_ + +#if MXNET_USE_CUDA + +namespace mxnet { +namespace common { +namespace cuda { +namespace rtc { + +const char reducer[] = R"code( + +namespace red { + +/*! \brief sum reducer */ +struct sum { + /*! \brief do reduction into dst */ + template + __device__ inline static void Reduce(volatile DType& dst, volatile DType2 src) { + dst = op::add(dst, src); + } + + /*! \brief do stable reduction into dst */ + template + __device__ inline static void Reduce(volatile DType& dst, volatile DType2 src, + volatile DType& residual) { + DType y = op::sub(src, residual); + DType t = dst + y; + if (isinf(t)) { + residual = 0; + } else { + residual = (t - dst) - y; + } + dst = t; + } + /*! \brief combine the results of two reducers */ + template + __device__ inline static void Merge(volatile DType& dst_val, volatile DType& src_val) { + Reduce(dst_val, src_val); + } + /*! \brief combine the results of two reducers */ + template + __device__ inline static void Merge(volatile DType& dst_val, volatile DType& dst_residual, + volatile DType& src_val, volatile DType& src_residual) { + DType t1 = dst_val + src_val; + if (isinf(t1)) { + dst_val = t1; + dst_residual = 0; + } else { + DType e = t1 - dst_val; + DType t2 = ((src_val - e) + (dst_val - (t1 - e))) + dst_residual + src_residual; + dst_val = t1 + t2; + dst_residual = t2 - (dst_val - t1); + } + } + /*! 
+ *\brief set the initial value during reduction + */ + template + __device__ inline static void SetInitValue(DType &initv) { + initv = 0; + } + /*! + *\brief set the initial value during reduction + */ + template + __device__ inline static void SetInitValue(DType &initv, DType &residual) { + SetInitValue(initv); + residual = 0; + } +}; +} // namespace red + +)code"; + +} // namespace rtc +} // namespace cuda +} // namespace common +} // namespace mxnet + +#endif // MXNET_USE_CUDA + +#endif // MXNET_COMMON_CUDA_RTC_REDUCER_INL_H_ + diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index ddd49841da20..02c7d83acf69 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -190,6 +190,24 @@ __device__ inline index_t unravel_dot(const index_t idx, const index_t (&shape)[ return ret; } +template +__device__ inline index_t unravel_ravel(const index_t idx, const index_t (&shape1)[MAX_DIM], + const index_t (&shape2)[MAX_DIM]) { + index_t ret = 0; + index_t total_shape = 1; +#pragma unroll + for (index_t i = ndim-1, j = idx; i >=0; --i) { + if (i != ndim - 1) { + total_shape *= shape2[i + 1]; + } + auto tmp = j / shape1[i]; + const index_t coord = j - tmp*shape1[i]; + ret += total_shape * (shape2[i] > coord) * coord; + j = tmp; + } + return ret; +} + } // namespace util )code"; } // namespace rtc diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 15fc1d879d7d..7303ae47ac8d 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -258,19 +258,6 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, } } -// Returns the stride with which the fastest dimension is moving. -// Used to detect memory access scatter. -template -MSHADOW_XINLINE int fastest_stride(const Shape& small, const Shape& big, - const Shape& big_stride) { - for (int i = ndim-1; i >= 0; --i) { - if (big[i] != 1) { - return (small[i] == big[i]) ? 1 : big_stride[i]; - } - } - return 1; -} - // Returns a/b integer division rounded up template Type ceil_idiv(const Type a, const Type b) { @@ -354,20 +341,19 @@ ReduceImplConfig ConfigureReduceImpl(const mxnet::TShape& small, } else { int reduce_strides[3]; - reduce_strides[0] = fastest_stride(small.get(), big.get(), - big.get()); - reduce_strides[1] = (multiOp) ? fastest_stride(small.get(), - lhs->get(), lhs->get()) : 1; - reduce_strides[2] = (multiOp) ? fastest_stride(small.get(), - rhs->get(), rhs->get()) : 1; + reduce_strides[0] = fastest_stride(small, big, big); + reduce_strides[1] = (multiOp) ? fastest_stride(small, *lhs, *lhs) : 1; + reduce_strides[2] = (multiOp) ? fastest_stride(small, *rhs, *rhs) : 1; int reduce_strides_transp[3]; - reduce_strides_transp[0] = fastest_stride(small.get(), config.rshape, - config.rstride); + reduce_strides_transp[0] = fastest_stride(small, TShape(config.rshape), + TShape(config.rstride)); reduce_strides_transp[1] = (multiOp) ? - fastest_stride(small.get(), config.lhs_shape, config.lhs_stride) : 1; + fastest_stride(small, TShape(config.lhs_shape), + TShape(config.lhs_stride)) : 1; reduce_strides_transp[2] = (multiOp) ? 
- fastest_stride(small.get(), config.rhs_shape, config.rhs_stride) : 1; + fastest_stride(small, TShape(config.rhs_shape), + TShape(config.rhs_stride)) : 1; uint64_t num_load = calc_num_load(config.N, config.M, reduce_strides); uint64_t num_load_transp = calc_num_load(config.M, config.N, reduce_strides_transp); diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index dd1505d9357c..b523f085726f 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -287,6 +287,23 @@ MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const assign(&small[idx], addto, OType(val)); } +namespace { + +// Returns the stride with which the fastest dimension is moving. +// Used to detect memory access scatter. +inline int fastest_stride(const TShape &small, const TShape &big, + const TShape &big_stride) { + const int ndim = small.ndim(); + for (int i = ndim-1; i >= 0; --i) { + if (big[i] != 1) { + return (small[i] == big[i]) ? 1 : big_stride[i]; + } + } + return 1; +} + +} // namespace + #ifdef __CUDACC__ #include "broadcast_reduce-inl.cuh" #endif @@ -478,6 +495,20 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, lhs.shape_.get(), rhs.shape_.get()); } +#if MXNET_USE_CUDA + +void RTCReduce(const NodeAttrs& attrs, + const OpContext& ctx, + const TBlob& small, + const OpReqType req, + const Tensor& workspace, + const TBlob& big, + const std::string& reducer, + int ndim, + const std::string& OP); + +#endif + } // namespace broadcast } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index 7f8b7a55ffe7..4647acf9d994 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -19,6 +19,10 @@ #include +#if MXNET_USE_CUDA +#include +#endif // MXNET_USE_CUDA + #include "broadcast_reduce-inl.h" #include "elemwise_binary_broadcast_op.h" @@ -342,6 +346,62 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, } } +void BinaryBroadcastRTCBackwardUseNone::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U); + mxnet::TShape new_lshape, new_rshape, new_oshape; + int ndim = BinaryBroadcastShapeCompact(outputs[0].shape_, outputs[1].shape_, inputs[0].shape_, + &new_lshape, &new_rshape, &new_oshape); + if (!ndim) { + ElemwiseBinaryRTCBwdUseNone {LOP, ROP}(attrs, ctx, inputs, req, outputs); + } else { + Stream *s = ctx.get_stream(); + const TBlob lhs = outputs[0].reshape(new_lshape); + const TBlob rhs = outputs[1].reshape(new_rshape); + const TBlob out = inputs[0].reshape(new_oshape); + BROADCAST_NDIM_SWITCH(ndim, NDim, { + // Request temporary storage + size_t workspace_size = new_oshape.Size(); + Tensor workspace = + ctx.requested[0].get_space_typed( + Shape1(workspace_size * sizeof(index_t)), s); + if (out.shape_.Size() != 0) { + broadcast::RTCReduce(attrs, ctx, lhs, req[0], + workspace, out, + "red::sum", NDim, LOP); + broadcast::RTCReduce(attrs, ctx, rhs, req[1], + workspace, out, + "red::sum", NDim, ROP); + } else { + using namespace common::cuda::rtc::util; + if (lhs.shape_.Size() != 0) { + cudaMemsetAsync(lhs.dptr_, 0, + lhs.shape_.Size() * mshadow_type_info(lhs.type_flag_).size, + Stream::GetStream(s)); + } + if 
(rhs.shape_.Size() != 0) { + cudaMemsetAsync(rhs.dptr_, 0, + rhs.shape_.Size() * mshadow_type_info(rhs.type_flag_).size, + Stream::GetStream(s)); + } + } + }); + } +} + +void BinaryBroadcastRTCBackwardUseIn::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LOG(FATAL) << "Not implemented yet!"; +} + #endif // MXNET_USE_CUDA } // namespace op diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index b2be0dfadd6d..8c44a1a79ac9 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -259,6 +259,28 @@ struct BinaryBroadcastRTCCompute { const std::vector& outputs); }; +struct BinaryBroadcastRTCBackwardUseNone { + std::string LOP; + std::string ROP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +struct BinaryBroadcastRTCBackwardUseIn { + std::string LOP; + std::string ROP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + #endif // MXNET_USE_CUDA template @@ -563,14 +585,6 @@ BinaryBroadcastBackwardUseNone(const nnvm::NodeAttrs& attrs, } } -template -inline typename std::enable_if::value, void>::type -BinaryBroadcastBackwardUseNone(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - template void BinaryBroadcastBackwardUseInImplWithWorkspace(const OpContext& ctx, const std::vector& inputs, diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu index e9a75c509fd4..07fbb8363530 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu @@ -33,16 +33,14 @@ NNVM_REGISTER_OP(broadcast_add) .set_attr("FComputeEx", BinaryBroadcastComputeDenseEx); NNVM_REGISTER_OP(_backward_broadcast_add) -.set_attr("FCompute", BinaryBroadcastBackwardUseNone); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseNone{"identity", "identity"}); NNVM_REGISTER_OP(broadcast_sub) .set_attr("FCompute", BinaryBroadcastRTCCompute{"sub"}) .set_attr("FComputeEx", BinaryBroadcastComputeDenseEx); NNVM_REGISTER_OP(_backward_broadcast_sub) -.set_attr("FCompute", BinaryBroadcastBackwardUseNone); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseNone{"identity", "negation"}); NNVM_REGISTER_OP(broadcast_mul) .set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}) diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc new file mode 100644 index 000000000000..7db469e28969 --- /dev/null +++ b/src/operator/tensor/reduce_rtc.cc @@ -0,0 +1,500 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "broadcast_reduce-inl.h" +#include "elemwise_unary_op.h" + +#if MXNET_USE_CUDA +#include "../../common/cuda/rtc.h" +#endif // MXNET_USE_CUDA + +using namespace mshadow; + +namespace mxnet { +namespace op { +namespace broadcast { + +#if MXNET_USE_CUDA + +namespace { + +constexpr int nthread_reduce = 512; +constexpr int kBaseGridNum = 1024; + +int diff(const TShape& small, const TShape& big, TShape* dims, + TShape* stride) { + int ndim = small.ndim(); + int mdim = 0; + #pragma unroll + for (int i = 0; i < ndim; ++i) { + mdim += small[i] != big[i]; + (*dims)[i] = (*stride)[i] = 1; + } + + index_t s = 1; + #pragma unroll + for (int i = ndim - 1, j = mdim; i >= 0; --i) { + if (small[i] != big[i]) { + --j; + (*stride)[j] = s; + (*dims)[j] = big[i]; + } + s *= big[i]; + } + return mdim; +} + +constexpr int warpSize = 32; +constexpr int unroll_reduce = 2; +constexpr int maxLoopPerTB = 64; + +// Returns a/b integer division rounded up +template +Type ceil_idiv(const Type a, const Type b) { + return (a + b - 1)/b; +} + +uint64_t calc_num_load(const int X, const int Y, const int* strides) { + // Number of full warps + uint64_t num_full_warp = X / warpSize; + // Length of the partial warp i.e. number of threads that are performing loads + uint64_t len_part_warp = X % warpSize; + + uint64_t num_load_full = (std::min(warpSize, strides[0]) + + std::min(warpSize, strides[1]) + + std::min(warpSize, strides[2]))*num_full_warp; + + uint64_t num_load_part = + (std::min(len_part_warp, ceil_idiv(len_part_warp*strides[0], warpSize)) + + std::min(len_part_warp, ceil_idiv(len_part_warp*strides[1], warpSize)) + + std::min(len_part_warp, ceil_idiv(len_part_warp*strides[2], warpSize)))* + (len_part_warp != 0); + + uint64_t num_load = (num_load_full + num_load_part)*(uint64_t)Y; + return num_load; +} + +struct RTCReduceImplConfig { + index_t N; + index_t M; + index_t Mnext; + struct { + dim3 blockDim; + dim3 gridDim; + int shMemSize; + bool do_transpose; + } kernel_1; + struct { + int blockSize; + int gridSize; + } kernel_2; + size_t workspace_size; + + TShape rshape, rstride; + TShape lhs_shape, lhs_stride; + TShape rhs_shape, rhs_stride; + + RTCReduceImplConfig(const ::mxnet::TShape& small, const ::mxnet::TShape& big, + const size_t type_size, const ::mxnet::TShape* lhs, + const ::mxnet::TShape* rhs) : + rshape(small.ndim(), 1), rstride(small.ndim(), 1), + lhs_shape(small.ndim(), 1), lhs_stride(small.ndim(), 1), + rhs_shape(small.ndim(), 1), rhs_stride(small.ndim(), 1) { + int ndim = small.ndim(); + + diff(small, big, &rshape, &rstride); + N = small.Size(); + + M = rshape[0]; + for (int i = 1; i < ndim; ++i) { + M *= rshape[i]; + } + + bool multiOp = false; + if (lhs != nullptr) { + CHECK_NOTNULL(rhs); + diff(small, *lhs, &lhs_shape, &lhs_stride); + diff(small, *rhs, &rhs_shape, &rhs_stride); + multiOp = true; + } + + workspace_size = 0; + + if (M == 1) { + kernel_1.blockDim.x = nthread_reduce; + kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, + (N + kernel_1.blockDim.x - 1)/kernel_1.blockDim.x); + } else { + + int reduce_strides[3]; + reduce_strides[0] = 
fastest_stride(small, big, big); + reduce_strides[1] = (multiOp) ? fastest_stride(small, *lhs, *lhs) : 1; + reduce_strides[2] = (multiOp) ? fastest_stride(small, *rhs, *rhs) : 1; + + int reduce_strides_transp[3]; + reduce_strides_transp[0] = fastest_stride(small, rshape, rstride); + reduce_strides_transp[1] = (multiOp) ? + fastest_stride(small, lhs_shape, lhs_stride) : 1; + reduce_strides_transp[2] = (multiOp) ? + fastest_stride(small, rhs_shape, rhs_stride) : 1; + + uint64_t num_load = calc_num_load(N, M, reduce_strides); + uint64_t num_load_transp = calc_num_load(M, N, reduce_strides_transp); + + Mnext = 1; + kernel_1.do_transpose = (num_load > num_load_transp); + + kernel_1.blockDim.x = 0; + kernel_1.blockDim.y = 0; + + if (kernel_1.do_transpose) { + // Fastest thread ID goes through M + // Loop over N has step size kernel_1.blockDim.y + if (N < 8) { + kernel_1.blockDim.y = 1; + } else if (N < 256) { + kernel_1.blockDim.y = 4; + } else { + if (M < 8) { + kernel_1.blockDim.x = 1; + } else if (M < 256) { + kernel_1.blockDim.x = 4; + } else { + kernel_1.blockDim.x = warpSize; + } + } + } else { + // Fastest thread ID goes through N + // Loop over M has step size kernel_1.blockDim.y + if (M < 8) { + kernel_1.blockDim.y = 1; + } else if (M < 256) { + kernel_1.blockDim.y = 4; + } else { + if (N < 8) { + kernel_1.blockDim.x = 1; + } else if (N < 256) { + kernel_1.blockDim.x = 4; + } else { + kernel_1.blockDim.x = warpSize; + } + } + } + + if (kernel_1.blockDim.x == 0 && kernel_1.blockDim.y == 0) { + LOG(FATAL) << "Unable to set blockDim"; + } else if (kernel_1.blockDim.x == 0) { + kernel_1.blockDim.x = nthread_reduce / kernel_1.blockDim.y; + } else if (kernel_1.blockDim.y == 0) { + kernel_1.blockDim.y = nthread_reduce / kernel_1.blockDim.x; + } + + if (kernel_1.do_transpose) { + // Fastest thread ID goes through M + kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, + ceil_idiv(N, kernel_1.blockDim.y)); + kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); + int by = kernel_1.blockDim.y; + if (kernel_1.blockDim.y % warpSize == 0) { + // Fix shared memory bank conflict + by++; + } + kernel_1.shMemSize = (kernel_1.blockDim.x > 1) ? + kernel_1.blockDim.x*by*type_size * 2 : 0; + // Maximum number of times we want TB to loop in M + // Max size of M-block each TB can handle + int maxMblock = kernel_1.blockDim.x*maxLoopPerTB; + Mnext = (M + maxMblock - 1) / maxMblock; + } else { + // Fastest thread ID goes through N + kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, + ceil_idiv(N, kernel_1.blockDim.x)); + kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); + kernel_1.shMemSize = (kernel_1.blockDim.y > 1) ? 
+ kernel_1.blockDim.x*kernel_1.blockDim.y*type_size * 2 : 0; + // Maximum number of times we want TB to loop in M + // Max size of M-block each TB can handle + int maxMblock = kernel_1.blockDim.y*maxLoopPerTB; + Mnext = (M + maxMblock - 1) / maxMblock; + } + + if (Mnext > 1) { + // small_dptr[] is N*Mnext*type_size bytes + workspace_size += N*Mnext*sizeof(double); + // Set gridDim.y to Mnext + kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); + } + + if (Mnext > 1) { + kernel_2.blockSize = nthread_reduce; + kernel_2.gridSize = std::min((int)kBaseGridNum, + (N + kernel_2.blockSize - 1)/kernel_2.blockSize ); + } + + } + } + +}; + +struct reduce_kernel_params { + index_t big_shape[MAX_DIM]; + index_t small_shape[MAX_DIM]; + index_t rshape[MAX_DIM]; + index_t rstride[MAX_DIM]; +}; + +const char reduce_kernel_code[] = R"code( +struct reduce_kernel_params { + index_t big_shape[util::MAX_DIM]; + index_t small_shape[util::MAX_DIM]; + index_t rshape[util::MAX_DIM]; + index_t rstride[util::MAX_DIM]; +}; + +__global__ void reduce_kernel(const int N, const int M, const bool addto, + const InputType0* __restrict big, + OutputType0 *small, + const reduce_kernel_params params, + const int Mnext) { + extern __shared__ char shTileChar[]; + using IType = AccType; + using OType = AccType; + using AType = typename IType::type; + AType* shTile = (AType*)(shTileChar); + const int tid = threadIdx.x + threadIdx.y*blockDim.x; + const int bx = (do_transpose) ? blockDim.y : blockDim.x; + const int by = (do_transpose) ? blockDim.x : blockDim.y; + const int tidx = (do_transpose) ? tid / by : threadIdx.x; + const int tidy = (do_transpose) ? tid % by : threadIdx.y; + for (int m0 = blockIdx.y; m0 < Mnext; m0 += gridDim.y) { + // This TB handles M range [Mstart, ...., Mend - 1] + const index_t Mstart = (index_t)((int64)M*(int64)m0/(int64)Mnext); + const index_t Mend = (index_t)((int64)M*(int64)(m0 + 1)/(int64)Mnext); + for (index_t idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { + int idx = idx0 + tidx; + index_t idx_big0 = util::unravel_ravel(idx, params.small_shape, params.big_shape); + + AType val, residual; + REDUCER::SetInitValue(val, residual); + if (idx < N) { + for (index_t k = tidy + Mstart; k < Mend; k += by*UNROLL) { + index_t idx_big[UNROLL]; + #pragma unroll + for (int u=0;u < UNROLL;u++) { + idx_big[u] = idx_big0 + util::unravel_dot(k + u*by, params.rshape, + params.rstride); + } + typename OType::type tmp[UNROLL]; + #pragma unroll + for (int u=0;u < UNROLL;u++) { + if (k + u*by < Mend) { + tmp[u] = OP(OType::from(big[idx_big[u]])); + } + } + #pragma unroll + for (int u=0;u < UNROLL;u++) { + if (k + u*by < Mend) REDUCER::Reduce(val, tmp[u], residual); + } + } + } + + // Shared memory block bx * by. Reduction is along by. Final result is in tidy=0 + if (by > 1) { + // Fix bx to avoid bank conflicts. Assumes warpSize number of banks + const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? 
(bx + 1) : bx; + const int it0 = tidx + tidy*fbx; + shTile[it0 * 2] = val; + shTile[it0 * 2 + 1] = residual; + __syncthreads(); + for (int t=1;t < by;t <<= 1) { + AType tmp, tmp_residual; + REDUCER::SetInitValue(tmp, tmp_residual); + if (tidy + t < by) { + tmp = shTile[(it0 + t*fbx) * 2]; + tmp_residual = shTile[(it0 + t*fbx) * 2 + 1]; + } + __syncthreads(); + REDUCER::Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp, tmp_residual); + __syncthreads(); + } + if (idx < N && tidy == 0) { + if (addto) { + small[idx + m0 * N] = OType::to(op::add(OType::from(small[idx + m0 * N]), + shTile[tidx * 2])); + } else { + small[idx + m0 * N] = OType::to(shTile[tidx * 2]); + } + } + } else { + if (idx < N) { + if (addto) { + small[idx + m0 * N] = OType::to(op::add(OType::from(small[idx + m0 * N]), + val)); + } else { + small[idx + m0 * N] = OType::to(val); + } + } + } + } + } +} +)code"; + +const char reduce_lines_kernel_code[] = R"code( +__global__ void reduce_lines_kernel(const index_t N, const index_t M, + const index_t small_in_stride, + const OutputType0* __restrict small_in, + OutputType0 *small_out) { + using OType = AccType; + for (index_t idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { + typename OType::type val, residual; + REDUCER::SetInitValue(val, residual); + for (int k = 0; k < M; k++) { + REDUCER::Reduce(val, + OType::from(reinterpret_cast(small_in)[idx + k*small_in_stride]), + residual); + } + + if (idx < N) { + if (req == OpReqType::kAddTo) { + small_out[idx] = OType::to(op::add(OType::from(small_out[idx]), val)); + } else { + small_out[idx] = OType::to(val); + } + } + + } +} +)code"; + +void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, + const TBlob& big, const Tensor& workspace, + const RTCReduceImplConfig& config, const int ndim, + const std::string &common_code, int dev_id) { + using namespace common::cuda::rtc; + void* small_dptr = small.dptr_; + bool first_kernel_addto = addto; + if (config.Mnext > 1) { + // small_dptr[] is N*Mnext*sizeof(DType) bytes + small_dptr = workspace.dptr_; + first_kernel_addto = false; + // Check that the workspace is contigiuous + CHECK_EQ(workspace.CheckContiguous(), true); + // Check that we have enough storage + CHECK_GE(workspace.size(0), config.workspace_size); + } + + const int by = (config.kernel_1.do_transpose) ? + config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; + const bool do_unroll = ( config.M / (by*config.Mnext) >= unroll_reduce ); + std::string code = common_code + + "#define UNROLL " + + (do_unroll ? std::to_string(unroll_reduce) : "1") + + "\n" + "const bool do_transpose = " + + (config.kernel_1.do_transpose ? 
"true" : "false") + + ";\n" + "using InputType0 = " + + util::mshadow_type_info(big.type_flag_).name + + ";\n" + "using OutputType0 = " + + util::mshadow_type_info(small.type_flag_).name + + ";\n"; + + reduce_kernel_params param {}; + for (int i = 0; i < ndim; ++i) { + param.big_shape[i] = big.shape_[i]; + param.small_shape[i] = small.shape_[i]; + param.rshape[i] = config.rshape[i]; + param.rstride[i] = config.rstride[i]; + } + + std::vector args; + args.emplace_back(&config.N); + args.emplace_back(&config.M); + args.emplace_back(&first_kernel_addto); + args.emplace_back(&big.dptr_); + args.emplace_back(&small_dptr); + args.emplace_back(¶m); + args.emplace_back(&config.Mnext); + + auto reduce_kernel_func = get_function(code + reduce_kernel_code, "reduce_kernel", dev_id); + launch(reduce_kernel_func, config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, s, &args); + + if (config.Mnext > 1) { + args.resize(0); + args.emplace_back(&config.N); + args.emplace_back(&config.Mnext); + args.emplace_back(&config.N); + args.emplace_back(&small_dptr); + args.emplace_back(&small.dptr_); + + auto reduce_lines_kernel_func = get_function(code + reduce_lines_kernel_code, + "reduce_lines_kernel", dev_id); + launch(reduce_lines_kernel_func, config.kernel_2.gridSize, + config.kernel_2.blockSize, 0, s, &args); + } +} + + +} // namespace + +void RTCReduce(const NodeAttrs& attrs, + const OpContext& ctx, + const TBlob& small, + const OpReqType req, + const Tensor& workspace, + const TBlob& big, + const std::string& reducer, + int ndim, + const std::string& OP) { + using namespace mxnet::common::cuda::rtc; + if (req == kNullOp) return; + Stream *s = ctx.get_stream(); + size_t type_size = util::mshadow_type_info(small.type_flag_).size; + if (small.type_flag_ == mshadow::kFloat16) { + type_size = sizeof(float); + } + RTCReduceImplConfig config(small.shape_, big.shape_, type_size, nullptr, nullptr); + if (config.M == 1) { + // With M == 1 result is just (possibly reshaped) OP(big) + UnaryRTCCompute {OP} (attrs, ctx, {big}, {req}, {small}); + } else { + std::string common_code = std::string("const OpReqType req = ") + + util::to_string(req) + + ";\n" + "#define OP op::" + + OP + + "\n" + "#define REDUCER " + + reducer + + "\n" + "const int ndim = " + + std::to_string(ndim) + + ";\n"; + RTCReduceImpl(s, small, req == kAddTo, big, workspace, config, + ndim, common_code, ctx.run_ctx.ctx.dev_id); + } +} + +#endif // MXNET_USE_CUDA + +} // namespace broadcast +} // namespace op +} // namespace mxnet From d6074e9853e775395fb4f4370c1be85015c39a81 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 18 Jun 2020 12:46:55 -0700 Subject: [PATCH 14/64] RTC for broadcast bwd usein --- src/common/cuda/rtc.cc | 25 - src/common/cuda/rtc.h | 10 - src/common/cuda/rtc/backward_functions-inl.h | 26 +- src/common/cuda/rtc/reducer-inl.h | 4 +- src/common/cuda/rtc/util-inl.h | 53 +- src/common/cuda/rtc/vectorization-inl.h | 11 +- src/common/utils.cc | 25 + src/common/utils.h | 10 + src/operator/leaky_relu-inl.h | 46 +- src/operator/nn/group_norm-inl.h | 13 +- src/operator/nn/layer_norm-inl.h | 13 +- src/operator/numpy/np_broadcast_reduce_op.h | 4 +- .../numpy/np_elemwise_broadcast_op.cu | 3 +- src/operator/numpy/np_elemwise_broadcast_op.h | 8 +- .../np_elemwise_broadcast_op_extended.cu | 17 +- .../quantization/quantization_utils.h | 2 +- src/operator/random/pdf_op.h | 4 +- src/operator/tensor/broadcast_reduce-inl.cuh | 242 +--------- src/operator/tensor/broadcast_reduce-inl.h | 271 ++++++++++- 
src/operator/tensor/broadcast_reduce_op.h | 4 +- .../elemwise_binary_broadcast_op-inl.cuh | 82 ---- .../tensor/elemwise_binary_broadcast_op.cc | 49 +- .../tensor/elemwise_binary_broadcast_op.h | 11 +- .../elemwise_binary_broadcast_op_basic.cu | 9 +- .../elemwise_binary_broadcast_op_extended.cu | 13 +- src/operator/tensor/elemwise_binary_op.cc | 2 +- src/operator/tensor/elemwise_sum.cu | 2 +- src/operator/tensor/reduce_rtc.cc | 451 +++++++++--------- 28 files changed, 744 insertions(+), 666 deletions(-) delete mode 100644 src/operator/tensor/elemwise_binary_broadcast_op-inl.cuh diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 46b26229a0ce..5a0ee562f7cc 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -62,31 +62,6 @@ std::string to_string(OpReqType req) { LOG(FATAL) << "Unrecognized req."; } -TypeInfo mshadow_type_info(int type_flag) { - using namespace mshadow; - switch (type_flag) { - case kFloat32: - return TypeInfo("float32", sizeof(float)); - case kFloat64: - return TypeInfo("float64", sizeof(double)); - case kFloat16: - return TypeInfo("float16", 2); - case kUint8: - return TypeInfo("uint8", sizeof(uint8_t)); - case kInt32: - return TypeInfo("int32", sizeof(int32_t)); - case kInt8: - return TypeInfo("int8", sizeof(int8_t)); - case kInt64: - return TypeInfo("int64", sizeof(int64_t)); - case kBool: - return TypeInfo("bool", sizeof(bool)); - default: - LOG(FATAL) << "Unknown type flag " << type_flag; - return TypeInfo("INVALID", 1); - } -} - } // namespace util namespace { diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h index 057ba5c79896..b458691b63a7 100644 --- a/src/common/cuda/rtc.h +++ b/src/common/cuda/rtc.h @@ -48,16 +48,6 @@ namespace util { std::string to_string(OpReqType req); -struct TypeInfo { - std::string name; - int size; - - TypeInfo(const std::string name, const int size) : - name(std::move(name)), size(size) {} -}; - -TypeInfo mshadow_type_info(int type_flag); - } // namespace util extern std::mutex lock; diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index 4d88e2639f38..e7d7b7fd14a8 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -298,7 +298,13 @@ __device__ inline DType hypot_grad_right(const DType val, template __device__ inline DType copysign_grad(const DType val, const DType2 val2) { - return (a >= 0 && b >= 0) || (a < 0 && b < 0) ? 1 : -1; + return (val >= 0 && val2 >= 0) || (val < 0 && val2 < 0) ? 1 : -1; +} + +template +__device__ inline DType zero_grad(const DType val, + const DType2 val2) { + return 0; } template @@ -313,6 +319,12 @@ __device__ inline DType rarctan2_grad(const DType val, return val / (val * val + val2 * val2); } +template +__device__ inline DType arctan2_rgrad(const DType val, + const DType2 val2) { + return -rarctan2_grad(val, val2); +} + template __device__ inline DType ldexp_grad(const DType val, const DType2 val2) { @@ -387,6 +399,18 @@ __device__ inline DType smooth_l1_grad(const DType val, const DType2 scalar) { } } +template +__device__ inline DType xelu_grad(const DType val, + const DType2 val2) { + return (val > 0) ? 1 : val2; +} + +template +__device__ inline DType prelu_grad(const DType val, + const DType2 val2) { + return (val > 0) ? 
0 : val; +} + } // namespace op )code"; diff --git a/src/common/cuda/rtc/reducer-inl.h b/src/common/cuda/rtc/reducer-inl.h index b57a2fcbaee2..cb9e9a35c9b0 100644 --- a/src/common/cuda/rtc/reducer-inl.h +++ b/src/common/cuda/rtc/reducer-inl.h @@ -45,7 +45,7 @@ struct sum { volatile DType& residual) { DType y = op::sub(src, residual); DType t = dst + y; - if (isinf(t)) { + if (util::isinf(t)) { residual = 0; } else { residual = (t - dst) - y; @@ -62,7 +62,7 @@ struct sum { __device__ inline static void Merge(volatile DType& dst_val, volatile DType& dst_residual, volatile DType& src_val, volatile DType& src_residual) { DType t1 = dst_val + src_val; - if (isinf(t1)) { + if (util::isinf(t1)) { dst_val = t1; dst_residual = 0; } else { diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index 02c7d83acf69..4fe93a72cf43 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ -#define MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ +#ifndef MXNET_COMMON_CUDA_RTC_UTIL_INL_H_ +#define MXNET_COMMON_CUDA_RTC_UTIL_INL_H_ #include @@ -208,6 +208,53 @@ __device__ inline index_t unravel_ravel(const index_t idx, const index_t (&shape return ret; } +template +__device__ inline index_t ravel(const index_t (&coord)[ndim], const index_t (&shape)[ndim2]) { + index_t ret = 0; +#pragma unroll + for (int i = 0; i < ndim; ++i) { + ret = ret * shape[i] + (shape[i] > coord[i]) * coord[i]; + } + return ret; +} + +template +__device__ inline void unravel(const index_t idx, + const index_t (&shape)[ndim2], + index_t (&coord)[ndim]) { +#pragma unroll + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; + coord[i] = j - tmp*shape[i]; + j = tmp; + } +} + +template +__device__ inline bool isinf(volatile const DType &val) { + return false; +} + +template <> +__device__ inline bool isinf(volatile const float &val) { + return ::isinf(val); +} + +template <> +__device__ inline bool isinf(volatile const double &val) { + return ::isinf(val); +} + +template <> +__device__ inline bool isinf(volatile const long double &val) { + return ::isinf(val); +} + +template <> +__device__ inline bool isinf(volatile const float16 &val) { + return ::isinf(__half2float(const_cast(val))); +} + } // namespace util )code"; } // namespace rtc @@ -217,4 +264,4 @@ __device__ inline index_t unravel_ravel(const index_t idx, const index_t (&shape #endif // MXNET_USE_CUDA -#endif // MXNET_COMMON_CUDA_RTC_TYPE_INL_H_ +#endif // MXNET_COMMON_CUDA_RTC_UTIL_INL_H_ diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index debe5b417015..a5c8c48426c4 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -30,6 +30,7 @@ #include #include "../rtc.h" +#include "../../utils.h" namespace mxnet { namespace common { @@ -261,7 +262,7 @@ Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim, const int nvec, const std::vector &inputs, const std::vector &outputs) { - using namespace util; + using namespace common; int align = -1; size_t i = 0; @@ -325,7 +326,6 @@ void VectorizedKernelRTCLauncher(const std::string &code, const std::vector &outputs, const int dev_id, const int lead_input_num = 0) { - using namespace util; const index_t N = lead_dim * other_dim; nvec = std::min(nvec, 4); // Use at most 4-wide vectors if (N != 0) { @@ -336,7 +336,7 @@ void VectorizedKernelRTCLauncher(const std::string 
&code, // Fill input types int counter = 0; for (const auto& input : inputs) { - const auto& type_info = mshadow_type_info(input.type_flag_); + const auto& type_info = common::mshadow_type_info(input.type_flag_); kernel_builder << "using InputType" << counter << " = " @@ -349,7 +349,7 @@ void VectorizedKernelRTCLauncher(const std::string &code, // Fill output types counter = 0; for (const auto& output : outputs) { - const auto& type_info = mshadow_type_info(output.type_flag_); + const auto& type_info = common::mshadow_type_info(output.type_flag_); kernel_builder << "using OutputType" << counter << " = " @@ -392,7 +392,8 @@ void VectorizedKernelRTCLauncher(const std::string &code, index_t num_aligned_elements = get_num_aligned_elements( params.inputs[lead_input_num], lead_dim, nvec, - mshadow_type_info(inputs[lead_input_num].type_flag_).size); + common::mshadow_type_info( + inputs[lead_input_num].type_flag_).size); size_t num_elements = other_dim * num_aligned_elements; constexpr int threads = vectorized_kernel_thread_num; constexpr int max_blocks = 65535; diff --git a/src/common/utils.cc b/src/common/utils.cc index 032a324c96b0..df5703b39476 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -108,5 +108,30 @@ void ExecuteMonOutputCallback( } } +MShadowTypeInfo mshadow_type_info(const int type_flag) { + using namespace mshadow; + switch (type_flag) { + case kFloat32: + return MShadowTypeInfo("float32", sizeof(float)); + case kFloat64: + return MShadowTypeInfo("float64", sizeof(double)); + case kFloat16: + return MShadowTypeInfo("float16", 2); + case kUint8: + return MShadowTypeInfo("uint8", sizeof(uint8_t)); + case kInt32: + return MShadowTypeInfo("int32", sizeof(int32_t)); + case kInt8: + return MShadowTypeInfo("int8", sizeof(int8_t)); + case kInt64: + return MShadowTypeInfo("int64", sizeof(int64_t)); + case kBool: + return MShadowTypeInfo("bool", sizeof(bool)); + default: + LOG(FATAL) << "Unknown type flag " << type_flag; + return MShadowTypeInfo("INVALID", 1); + } +} + } // namespace common } // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index 9ea3329f2c24..178c4cdeb28a 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -950,6 +950,16 @@ inline int GetDefaultDtype(int dtype) { mshadow::kFloat32; } +struct MShadowTypeInfo { + std::string name; + int size; + + MShadowTypeInfo(const std::string name, const int size) : + name(std::move(name)), size(size) {} +}; + +MShadowTypeInfo mshadow_type_info(const int type_flag); + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 3d81cfc0d967..ccb93080c65a 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -254,21 +254,39 @@ class LeakyReLUOp : public Operator { &new_rshape, &new_oshape) != 0; if (!need_bc) { - ElemwiseBinaryOp::BackwardUseIn( - nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], - in_data[leakyrelu::kData], - in_data[leakyrelu::kGamma]}, req, in_grad); + if constexpr (std::is_same::value) { + ElemwiseBinaryOp::BackwardUseIn( + nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, req, in_grad); + } else { + ElemwiseBinaryRTCBwdUseIn {"xelu_grad", "prelu_grad"}( + nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, req, in_grad); + } } else { - BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { - BinaryBroadcastBackwardUseInImpl( - ctx, 
{out_grad[leakyrelu::kOut], - in_data[leakyrelu::kData], - in_data[leakyrelu::kGamma]}, req, in_grad, - new_lshape, new_rshape, new_oshape); - }); + if constexpr (std::is_same::value) { + BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { + BinaryBroadcastBackwardUseInImpl( + ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, req, in_grad, + new_lshape, new_rshape, new_oshape); + }); + } else { + std::vector new_in_grad(2); + new_in_grad[leakyrelu::kData] = in_grad[leakyrelu::kData]; + new_in_grad[leakyrelu::kGamma] = in_grad[leakyrelu::kGamma].reshape(gshape); + BinaryBroadcastRTCBackwardUseIn {"xelu_grad", "prelu_grad"}( + nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, + req, new_in_grad); + } } break; } diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index d4f5adb1804f..adb22e20244b 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -117,7 +117,8 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(data.type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { workspace_size = - broadcast::ReduceWorkspaceSize(s, red_dst_shape, req[0], red_src_shape); + broadcast::ReduceWorkspaceSize(s, red_dst_shape, req[0], + red_src_shape, sizeof(DType)); }); }); @@ -282,14 +283,16 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { reduce_workspace_size = std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_dst_shape, - kAddTo, red_src_shape)); + broadcast::ReduceWorkspaceSize(s, red_dst_shape, + kAddTo, red_src_shape, + sizeof(DType))); }); BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { reduce_workspace_size = std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, - red_exclude_src_shape)); + broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, + red_exclude_src_shape, + sizeof(DType))); }); }); workspace = ctx.requested[0].get_space_typed( diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index a9b356cf4f96..49ac52dc6a7c 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -110,7 +110,8 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { workspace_size = - broadcast::ReduceWorkspaceSize(s, mean_data.shape_, req[0], in_data.shape_); + broadcast::ReduceWorkspaceSize(s, mean_data.shape_, req[0], + in_data.shape_, sizeof(DType)); }); }); workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -258,14 +259,16 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { reduce_workspace_size = std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_dst_shape, - kAddTo, red_src_shape)); + broadcast::ReduceWorkspaceSize(s, red_dst_shape, + kAddTo, red_src_shape, + sizeof(DType))); }); BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { reduce_workspace_size = std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, - red_exclude_src_shape)); + broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, + red_exclude_src_shape, + sizeof(DType))); }); }); workspace = ctx.requested[0].get_space_typed( diff --git 
a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 6b59ac0d8621..205fb621d4e8 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -994,8 +994,8 @@ void NumpyMomentsForward(const nnvm::NodeAttrs& attrs, // Get workspace and temp space for data - mean size_t workspace_size = 0; BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - workspace_size = broadcast::ReduceWorkspaceSize( - s, dst_shape, req[0], src_shape);; + workspace_size = broadcast::ReduceWorkspaceSize( + s, dst_shape, req[0], src_shape, sizeof(DType)); }); size_t temp_data_size = data.shape_.Size() * sizeof(DType); size_t temp_mem_size = temp_data_size + workspace_size; diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index 2fe56d2787b4..e5a017070c2a 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -46,8 +46,7 @@ NNVM_REGISTER_OP(_npi_multiply) .set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mul) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"right", "left"}); NNVM_REGISTER_OP(_npi_mod) .set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h index ce53eb6a3872..f813289cda6d 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_op.h @@ -413,10 +413,10 @@ void NumpyBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, MSHADOW_TYPE_SWITCH(ograd.type_flag_, OType, { if (need_bc) { BROADCAST_NDIM_SWITCH(new_oshape.ndim(), ndim, { - workspace_size_l = ReduceWorkspaceSize( - s, new_lshape, req[0], new_oshape, new_lshape, new_rshape); - workspace_size_r = ReduceWorkspaceSize( - s, new_rshape, req[1], new_oshape, new_lshape, new_rshape); + workspace_size_l = ReduceWorkspaceSize( + s, new_lshape, req[0], new_oshape, new_lshape, new_rshape, sizeof(OType)); + workspace_size_r = ReduceWorkspaceSize( + s, new_rshape, req[1], new_oshape, new_lshape, new_rshape, sizeof(OType)); }); } size_t workspace_size = std::max(workspace_size_l, workspace_size_r); diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu index 96d907eda8e0..4ec85e824cda 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu @@ -44,21 +44,22 @@ NNVM_REGISTER_OP(_npi_bitwise_or) .set_attr("FCompute", BinaryBroadcastIntCompute); NNVM_REGISTER_OP(_backward_npi_copysign) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"copysign_grad", + "zero_grad"}); NNVM_REGISTER_OP(_npi_arctan2) .set_attr("FCompute", BinaryBroadcastRTCCompute{"arctan2"}); NNVM_REGISTER_OP(_backward_npi_arctan2) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"arctan2_grad", + "arctan2_rgrad"}); + NNVM_REGISTER_OP(_npi_hypot) .set_attr("FCompute", BinaryBroadcastRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_npi_hypot) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"hypot_grad_left", + "hypot_grad_right"}); NNVM_REGISTER_OP(_npi_copysign_scalar) .set_attr("FCompute", 
BinaryScalarRTCCompute{"copysign"}); @@ -103,8 +104,8 @@ NNVM_REGISTER_OP(_npi_rldexp_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"rldexp"}); NNVM_REGISTER_OP(_backward_npi_ldexp) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"ldexp_grad", + "ldexp_rgrad"}); NNVM_REGISTER_OP(_backward_npi_ldexp_scalar) .set_attr("FCompute", BinaryScalarRTCBackward{"ldexp_grad"}); diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h index 5230576ce594..1432baa39b1b 100644 --- a/src/operator/quantization/quantization_utils.h +++ b/src/operator/quantization/quantization_utils.h @@ -184,7 +184,7 @@ inline size_t ConfigReduce(mshadow::Stream* s, CHECK_EQ(src_shape->ndim(), NDim); CHECK_EQ(dst_shape->ndim(), NDim); - return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape); + return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape, sizeof(DType)); } enum QuantizeOutType { kAuto = 0, kInt8, kUint8 }; diff --git a/src/operator/random/pdf_op.h b/src/operator/random/pdf_op.h index ee15e993c430..fdba85f1101a 100644 --- a/src/operator/random/pdf_op.h +++ b/src/operator/random/pdf_op.h @@ -588,8 +588,8 @@ void PdfOpBackward(const nnvm::NodeAttrs& attrs, const TShape src_shape(Shape2(N, outputs[0].Size() / N)), dst_shape(Shape2(N, 1)); // Inputs to PdfOpBackward: grad, samples, parm1, parm2, pdf. MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - const size_t red_work_size(broadcast::ReduceWorkspaceSize<2, DType>( - s, dst_shape, kAddTo, src_shape)); + const size_t red_work_size(broadcast::ReduceWorkspaceSize<2>( + s, dst_shape, kAddTo, src_shape, sizeof(DType))); const size_t tmp_size(outputs[0].Size() * pnum * sizeof(DType) + red_work_size); Tensor tmp_space = ctx.requested[0].get_space_typed(Shape1(tmp_size), s); diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 7303ae47ac8d..9e3adffe9ef1 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -28,7 +28,6 @@ using namespace mshadow::cuda; -const int nthread_reduce = kMaxThreadsPerBlock; template __launch_bounds__(nthread_reduce) __global__ void reduce_kernel(const int N, const int M, const bool addto, @@ -258,201 +257,6 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, } } -// Returns a/b integer division rounded up -template -Type ceil_idiv(const Type a, const Type b) { - return (a + b - 1)/b; -} - -// Configuration for ReduceImpl() -template -struct ReduceImplConfig { - static const int warpSize = 32; - static const int unroll_reduce = 2; - static const int maxLoopPerTB = 64; - int N; - int M; - int Mnext; - struct { - dim3 blockDim; - dim3 gridDim; - int shMemSize; - bool do_transpose; - } kernel_1; - struct { - int blockSize; - int gridSize; - } kernel_2; - size_t workspace_size; - - Shape rshape, rstride; - Shape lhs_shape, lhs_stride; - Shape rhs_shape, rhs_stride; -}; - -static inline uint64_t calc_num_load(const int X, const int Y, const int* strides) { - const int warpSize = ReduceImplConfig<1>::warpSize; - // Number of full warps - uint64_t num_full_warp = X / warpSize; - // Length of the partial warp i.e. 
number of threads that are performing loads - uint64_t len_part_warp = X % warpSize; - - uint64_t num_load_full = (std::min(warpSize, strides[0]) + - std::min(warpSize, strides[1]) + - std::min(warpSize, strides[2]))*num_full_warp; - - uint64_t num_load_part = - (std::min(len_part_warp, ceil_idiv(len_part_warp*strides[0], warpSize)) + - std::min(len_part_warp, ceil_idiv(len_part_warp*strides[1], warpSize)) + - std::min(len_part_warp, ceil_idiv(len_part_warp*strides[2], warpSize)))* - (len_part_warp != 0); - - uint64_t num_load = (num_load_full + num_load_part)*(uint64_t)Y; - return num_load; -} - -template -ReduceImplConfig ConfigureReduceImpl(const mxnet::TShape& small, - const mxnet::TShape& big, - const mxnet::TShape* lhs, - const mxnet::TShape* rhs) { - ReduceImplConfig config; - - diff(small.get(), big.get(), &config.rshape, &config.rstride); - config.N = small.Size(); - config.M = config.rshape.Size(); - - bool multiOp = false; - if (lhs != nullptr) { - CHECK_NOTNULL(rhs); - diff(small.get(), lhs->get(), &config.lhs_shape, - &config.lhs_stride); - diff(small.get(), rhs->get(), &config.rhs_shape, - &config.rhs_stride); - multiOp = true; - } - - config.workspace_size = 0; - - if (config.M == 1) { - config.kernel_1.blockDim.x = kMaxThreadsPerBlock; - config.kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - (config.N + config.kernel_1.blockDim.x - 1)/config.kernel_1.blockDim.x); - } else { - - int reduce_strides[3]; - reduce_strides[0] = fastest_stride(small, big, big); - reduce_strides[1] = (multiOp) ? fastest_stride(small, *lhs, *lhs) : 1; - reduce_strides[2] = (multiOp) ? fastest_stride(small, *rhs, *rhs) : 1; - - int reduce_strides_transp[3]; - reduce_strides_transp[0] = fastest_stride(small, TShape(config.rshape), - TShape(config.rstride)); - reduce_strides_transp[1] = (multiOp) ? - fastest_stride(small, TShape(config.lhs_shape), - TShape(config.lhs_stride)) : 1; - reduce_strides_transp[2] = (multiOp) ? 
- fastest_stride(small, TShape(config.rhs_shape), - TShape(config.rhs_stride)) : 1; - - uint64_t num_load = calc_num_load(config.N, config.M, reduce_strides); - uint64_t num_load_transp = calc_num_load(config.M, config.N, reduce_strides_transp); - - config.Mnext = 1; - config.kernel_1.do_transpose = (num_load > num_load_transp); - - config.kernel_1.blockDim.x = 0; - config.kernel_1.blockDim.y = 0; - - if (config.kernel_1.do_transpose) { - // Fastest thread ID goes through M - // Loop over N has step size config.kernel_1.blockDim.y - if (config.N < 8) { - config.kernel_1.blockDim.y = 1; - } else if (config.N < 256) { - config.kernel_1.blockDim.y = 4; - } else { - if (config.M < 8) { - config.kernel_1.blockDim.x = 1; - } else if (config.M < 256) { - config.kernel_1.blockDim.x = 4; - } else { - config.kernel_1.blockDim.x = config.warpSize; - } - } - } else { - // Fastest thread ID goes through N - // Loop over M has step size config.kernel_1.blockDim.y - if (config.M < 8) { - config.kernel_1.blockDim.y = 1; - } else if (config.M < 256) { - config.kernel_1.blockDim.y = 4; - } else { - if (config.N < 8) { - config.kernel_1.blockDim.x = 1; - } else if (config.N < 256) { - config.kernel_1.blockDim.x = 4; - } else { - config.kernel_1.blockDim.x = config.warpSize; - } - } - } - - if (config.kernel_1.blockDim.x == 0 && config.kernel_1.blockDim.y == 0) { - LOG(FATAL) << "Unable to set blockDim"; - } else if (config.kernel_1.blockDim.x == 0) { - config.kernel_1.blockDim.x = nthread_reduce / config.kernel_1.blockDim.y; - } else if (config.kernel_1.blockDim.y == 0) { - config.kernel_1.blockDim.y = nthread_reduce / config.kernel_1.blockDim.x; - } - - if (config.kernel_1.do_transpose) { - // Fastest thread ID goes through M - config.kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - ceil_idiv(config.N, config.kernel_1.blockDim.y)); - config.kernel_1.gridDim.y = std::min(kBaseGridNum, config.Mnext); - int by = config.kernel_1.blockDim.y; - if (config.kernel_1.blockDim.y % config.warpSize == 0) { - // Fix shared memory bank conflict - by++; - } - config.kernel_1.shMemSize = (config.kernel_1.blockDim.x > 1) ? - config.kernel_1.blockDim.x*by*sizeof(DType) * 2 : 0; - // Maximum number of times we want TB to loop in M - // Max size of M-block each TB can handle - int maxMblock = config.kernel_1.blockDim.x*config.maxLoopPerTB; - config.Mnext = (config.M + maxMblock - 1) / maxMblock; - } else { - // Fastest thread ID goes through N - config.kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - ceil_idiv(config.N, config.kernel_1.blockDim.x)); - config.kernel_1.gridDim.y = std::min(kBaseGridNum, config.Mnext); - config.kernel_1.shMemSize = (config.kernel_1.blockDim.y > 1) ? 
- config.kernel_1.blockDim.x*config.kernel_1.blockDim.y*sizeof(DType) * 2 : 0; - // Maximum number of times we want TB to loop in M - // Max size of M-block each TB can handle - int maxMblock = config.kernel_1.blockDim.y*config.maxLoopPerTB; - config.Mnext = (config.M + maxMblock - 1) / maxMblock; - } - - if (config.Mnext > 1) { - // small_dptr[] is N*Mnext*sizeof(DType) bytes - config.workspace_size += config.N*config.Mnext*sizeof(double); - // Set gridDim.y to Mnext - config.kernel_1.gridDim.y = std::min(kBaseGridNum, config.Mnext); - } - - if (config.Mnext > 1) { - config.kernel_2.blockSize = kMaxThreadsPerBlock; - config.kernel_2.gridSize = std::min((int)kBaseGridNum, - (config.N + config.kernel_2.blockSize - 1)/config.kernel_2.blockSize ); - } - - } - - return config; -} - #define KERNEL_UNROLL_SWITCH(do_unroll, unrollAmount, unrollVar, ...) \ if (do_unroll) { \ const int unrollVar = unrollAmount; \ @@ -465,7 +269,7 @@ ReduceImplConfig ConfigureReduceImpl(const mxnet::TShape& small, template void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req, const TBlob& big, const Tensor& workspace, - const ReduceImplConfig& config) { + const ReduceImplConfig& config) { if (config.M == 1) { reduce_kernel_M1 <<< config.kernel_1.gridDim, config.kernel_1.blockDim, 0, stream >>>( @@ -487,13 +291,13 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req, const int by = (config.kernel_1.do_transpose) ? config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; - const bool do_unroll = ( config.M / (by*config.Mnext) >= config.unroll_reduce ); - KERNEL_UNROLL_SWITCH(do_unroll, ReduceImplConfig::unroll_reduce, UNROLL, { + const bool do_unroll = ( config.M / (by*config.Mnext) >= unroll_reduce ); + KERNEL_UNROLL_SWITCH(do_unroll, unroll_reduce, UNROLL, { reduce_kernel <<< config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, stream>>>( config.N, config.M, addto, big.dptr(), small_dptr, big.shape_.get(), - small.shape_.get(), config.rshape, config.rstride, config.Mnext, - config.kernel_1.do_transpose); + small.shape_.get(), config.rshape.get(), config.rstride.get(), + config.Mnext, config.kernel_1.do_transpose); }); MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel); @@ -509,7 +313,7 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req, template void ReduceImpl(cudaStream_t stream, const TBlob& small, const TBlob& lhs, const TBlob& rhs, const OpReqType req, const TBlob& big, const Tensor& workspace, - const ReduceImplConfig& config) { + const ReduceImplConfig& config) { if (config.M == 1) { reduce_kernel_M1 <<< config.kernel_1.gridDim, config.kernel_1.blockDim, 0, stream >>>( @@ -532,14 +336,15 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const TBlob& lhs, const const int by = (config.kernel_1.do_transpose) ? 
config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; - const bool do_unroll = ( config.M / (by*config.Mnext) >= config.unroll_reduce ); - KERNEL_UNROLL_SWITCH(do_unroll, ReduceImplConfig::unroll_reduce, UNROLL, { + const bool do_unroll = ( config.M / (by*config.Mnext) >= unroll_reduce ); + KERNEL_UNROLL_SWITCH(do_unroll, unroll_reduce, UNROLL, { reduce_kernel <<< config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, stream>>>( config.N, config.M, addto, big.dptr(), lhs.dptr(), rhs.dptr(), small_dptr, big.shape_.get(), lhs.shape_.get(), - rhs.shape_.get(), small.shape_.get(), config.rshape, config.lhs_shape, - config.rhs_shape, config.rstride, config.lhs_stride, config.rhs_stride, config.Mnext, + rhs.shape_.get(), small.shape_.get(), config.rshape.get(), + config.lhs_shape.get(), config.rhs_shape.get(), config.rstride.get(), + config.lhs_stride.get(), config.rhs_stride.get(), config.Mnext, config.kernel_1.do_transpose); MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel); }); @@ -560,14 +365,14 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, const Tensor& workspace, const TBlob& big) { if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - ReduceImplConfig config = - ConfigureReduceImpl(small.shape_, big.shape_, nullptr, nullptr); + ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr, sizeof(DType)); if (safe_acc) { MXNET_ACC_TYPE_SWITCH(mshadow::DataType::kFlag, DataType, AType, { typedef typename std::conditional::type AccType; MSHADOW_TYPE_SWITCH(small.type_flag_, OType, { typedef typename std::conditional::type OutType; - config = ConfigureReduceImpl(small.shape_, big.shape_, nullptr, nullptr); + config = ReduceImplConfig(small.shape_, big.shape_, nullptr, nullptr, + sizeof(AccType)); ReduceImpl( stream, small, req, big, workspace, config); }); @@ -597,25 +402,8 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, const TBlob& lhs, const TBlob& rhs) { if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - ReduceImplConfig config = - ConfigureReduceImpl(small.shape_, big.shape_, &lhs.shape_, &rhs.shape_); + ReduceImplConfig config(small.shape_, big.shape_, &lhs.shape_, &rhs.shape_, sizeof(DType)); ReduceImpl(stream, small, lhs, rhs, req, big, workspace, config); } -template -size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, - const ::mxnet::TShape& big) { - if (req == kNullOp) return 0; - ReduceImplConfig config = ConfigureReduceImpl(small, big, nullptr, nullptr); - return config.workspace_size; -} - -template -size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, - const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, const ::mxnet::TShape& rhs) { - if (req == kNullOp) return 0; - ReduceImplConfig config = ConfigureReduceImpl(small, big, &lhs, &rhs); - return config.workspace_size; -} - #endif //MXNET_OPERATOR_TENSOR_BROADCAST_REDUCE_INL_CUH_ diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index b523f085726f..b263c3b01454 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -304,10 +304,6 @@ inline int fastest_stride(const TShape &small, const TShape &big, } // namespace -#ifdef __CUDACC__ -#include "broadcast_reduce-inl.cuh" -#endif - template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { @@ -408,19 +404,265 @@ void 
ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, small.shape_.get(), rshape, rstride, ws_dptr); } -template +template size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big) { + const mxnet::TShape& big, const int type_size) { return 0; } -template +template size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, const mxnet::TShape& big, const mxnet::TShape& lhs, - const mxnet::TShape& rhs) { + const mxnet::TShape& rhs, const int type_size) { return 0; } +#if MXNET_USE_CUDA + +namespace { + +constexpr int warpSize = 32; +constexpr int unroll_reduce = 2; + +// Returns a/b integer division rounded up +template +Type ceil_idiv(const Type a, const Type b) { + return (a + b - 1)/b; +} + +uint64_t calc_num_load(const int X, const int Y, const int* strides) { + // Number of full warps + uint64_t num_full_warp = X / warpSize; + // Length of the partial warp i.e. number of threads that are performing loads + uint64_t len_part_warp = X % warpSize; + + uint64_t num_load_full = (std::min(warpSize, strides[0]) + + std::min(warpSize, strides[1]) + + std::min(warpSize, strides[2]))*num_full_warp; + + uint64_t num_load_part = + (std::min(len_part_warp, ceil_idiv(len_part_warp*strides[0], warpSize)) + + std::min(len_part_warp, ceil_idiv(len_part_warp*strides[1], warpSize)) + + std::min(len_part_warp, ceil_idiv(len_part_warp*strides[2], warpSize)))* + (len_part_warp != 0); + + uint64_t num_load = (num_load_full + num_load_part)*(uint64_t)Y; + return num_load; +} + +inline int diff(const TShape& small, const TShape& big, + TShape* dims, TShape* stride) { + int ndim = small.ndim(); + int mdim = 0; + #pragma unroll + for (int i = 0; i < ndim; ++i) { + mdim += small[i] != big[i]; + (*dims)[i] = (*stride)[i] = 1; + } + + index_t s = 1; + #pragma unroll + for (int i = ndim - 1, j = mdim; i >= 0; --i) { + if (small[i] != big[i]) { + --j; + (*stride)[j] = s; + (*dims)[j] = big[i]; + } + s *= big[i]; + } + return mdim; +} + +constexpr int nthread_reduce = 512; +constexpr int kBaseGridNum = 1024; + +} // namespace + +// Configuration for ReduceImpl() +struct ReduceImplConfig { + index_t N; + index_t M; + index_t Mnext; + struct { + dim3 blockDim; + dim3 gridDim; + int shMemSize; + bool do_transpose; + } kernel_1; + struct { + int blockSize; + int gridSize; + } kernel_2; + size_t workspace_size; + + TShape rshape, rstride; + TShape lhs_shape, lhs_stride; + TShape rhs_shape, rhs_stride; + + inline ReduceImplConfig(const ::mxnet::TShape& small, const ::mxnet::TShape& big, + const ::mxnet::TShape* lhs, + const ::mxnet::TShape* rhs, + const size_t type_size) : + rshape(small.ndim(), 1), rstride(small.ndim(), 1), + lhs_shape(small.ndim(), 1), lhs_stride(small.ndim(), 1), + rhs_shape(small.ndim(), 1), rhs_stride(small.ndim(), 1) { + constexpr int maxLoopPerTB = 64; + int ndim = small.ndim(); + + diff(small, big, &rshape, &rstride); + N = small.Size(); + + M = rshape[0]; + for (int i = 1; i < ndim; ++i) { + M *= rshape[i]; + } + + bool multiOp = false; + if (lhs != nullptr) { + CHECK_NOTNULL(rhs); + diff(small, *lhs, &lhs_shape, &lhs_stride); + diff(small, *rhs, &rhs_shape, &rhs_stride); + multiOp = true; + } + + workspace_size = 0; + + if (M == 1) { + kernel_1.blockDim.x = nthread_reduce; + kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, + (N + kernel_1.blockDim.x - 1)/kernel_1.blockDim.x); + } else { + int reduce_strides[3]; + reduce_strides[0] = fastest_stride(small, big, big); + reduce_strides[1] = 
(multiOp) ? fastest_stride(small, *lhs, *lhs) : 1; + reduce_strides[2] = (multiOp) ? fastest_stride(small, *rhs, *rhs) : 1; + + int reduce_strides_transp[3]; + reduce_strides_transp[0] = fastest_stride(small, rshape, rstride); + reduce_strides_transp[1] = (multiOp) ? + fastest_stride(small, lhs_shape, lhs_stride) : 1; + reduce_strides_transp[2] = (multiOp) ? + fastest_stride(small, rhs_shape, rhs_stride) : 1; + + uint64_t num_load = calc_num_load(N, M, reduce_strides); + uint64_t num_load_transp = calc_num_load(M, N, reduce_strides_transp); + + Mnext = 1; + kernel_1.do_transpose = (num_load > num_load_transp); + + kernel_1.blockDim.x = 0; + kernel_1.blockDim.y = 0; + + if (kernel_1.do_transpose) { + // Fastest thread ID goes through M + // Loop over N has step size kernel_1.blockDim.y + if (N < 8) { + kernel_1.blockDim.y = 1; + } else if (N < 256) { + kernel_1.blockDim.y = 4; + } else { + if (M < 8) { + kernel_1.blockDim.x = 1; + } else if (M < 256) { + kernel_1.blockDim.x = 4; + } else { + kernel_1.blockDim.x = warpSize; + } + } + } else { + // Fastest thread ID goes through N + // Loop over M has step size kernel_1.blockDim.y + if (M < 8) { + kernel_1.blockDim.y = 1; + } else if (M < 256) { + kernel_1.blockDim.y = 4; + } else { + if (N < 8) { + kernel_1.blockDim.x = 1; + } else if (N < 256) { + kernel_1.blockDim.x = 4; + } else { + kernel_1.blockDim.x = warpSize; + } + } + } + + if (kernel_1.blockDim.x == 0 && kernel_1.blockDim.y == 0) { + LOG(FATAL) << "Unable to set blockDim"; + } else if (kernel_1.blockDim.x == 0) { + kernel_1.blockDim.x = nthread_reduce / kernel_1.blockDim.y; + } else if (kernel_1.blockDim.y == 0) { + kernel_1.blockDim.y = nthread_reduce / kernel_1.blockDim.x; + } + + if (kernel_1.do_transpose) { + // Fastest thread ID goes through M + kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, + ceil_idiv(N, kernel_1.blockDim.y)); + kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); + int by = kernel_1.blockDim.y; + if (kernel_1.blockDim.y % warpSize == 0) { + // Fix shared memory bank conflict + by++; + } + kernel_1.shMemSize = (kernel_1.blockDim.x > 1) ? + kernel_1.blockDim.x*by*type_size * 2 : 0; + // Maximum number of times we want TB to loop in M + // Max size of M-block each TB can handle + int maxMblock = kernel_1.blockDim.x*maxLoopPerTB; + Mnext = (M + maxMblock - 1) / maxMblock; + } else { + // Fastest thread ID goes through N + kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, + ceil_idiv(N, kernel_1.blockDim.x)); + kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); + kernel_1.shMemSize = (kernel_1.blockDim.y > 1) ? 
+ kernel_1.blockDim.x*kernel_1.blockDim.y*type_size * 2 : 0; + // Maximum number of times we want TB to loop in M + // Max size of M-block each TB can handle + int maxMblock = kernel_1.blockDim.y*maxLoopPerTB; + Mnext = (M + maxMblock - 1) / maxMblock; + } + + if (Mnext > 1) { + // small_dptr[] is N*Mnext*type_size bytes + workspace_size += N*Mnext*sizeof(double); + // Set gridDim.y to Mnext + kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); + } + + if (Mnext > 1) { + kernel_2.blockSize = nthread_reduce; + kernel_2.gridSize = std::min(kBaseGridNum, + (N + kernel_2.blockSize - 1)/kernel_2.blockSize); + } + } + } +}; + +template +size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big, const int type_size) { + if (req == kNullOp) return 0; + ReduceImplConfig config(small, big, nullptr, nullptr, type_size); + return config.workspace_size; +} + +template +size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, + const ::mxnet::TShape& rhs, const int type_size) { + if (req == kNullOp) return 0; + ReduceImplConfig config(small, big, &lhs, &rhs, type_size); + return config.workspace_size; +} + +#ifdef __CUDACC__ +#include "broadcast_reduce-inl.cuh" +#endif + +#endif // MXNET_USE_CUDA + template MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto, const DType* __restrict big, const DType* __restrict lhs, @@ -507,6 +749,19 @@ void RTCReduce(const NodeAttrs& attrs, int ndim, const std::string& OP); +void RTCReduce(const NodeAttrs& attrs, + const OpContext& ctx, + const TBlob& small, + const OpReqType req, + const Tensor& workspace, + const TBlob& big, + const TBlob &lhs, + const TBlob &rhs, + const std::string& reducer, + int ndim, + const std::string& OP1, + const std::string& OP2); + #endif } // namespace broadcast diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index ab187f6d54f7..35624287b5a0 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -633,8 +633,8 @@ void ReduceAxesComputeImpl(const OpContext& ctx, const TBlob in_data = inputs[0].reshape(src_shape); const TBlob out_data = outputs[0].reshape(dst_shape); BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - size_t workspace_size = broadcast::ReduceWorkspaceSize( - s, out_data.shape_, req[0], in_data.shape_); + size_t workspace_size = broadcast::ReduceWorkspaceSize( + s, out_data.shape_, req[0], in_data.shape_, sizeof(DType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); broadcast::Reduce( diff --git a/src/operator/tensor/elemwise_binary_broadcast_op-inl.cuh b/src/operator/tensor/elemwise_binary_broadcast_op-inl.cuh deleted file mode 100644 index d65e12aef86e..000000000000 --- a/src/operator/tensor/elemwise_binary_broadcast_op-inl.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file elemwise_binary_broadcast_op-inl.cuh - * \brief CUDA specific Function definition of elementwise binary broadcast operators - */ -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_CUH_ -#include -#include -#include -#include -#include -#include -#include "broadcast_reduce-inl.h" -namespace mxnet { -namespace op { -template -inline typename std::enable_if::value, void>::type -BinaryBroadcastBackwardUseNone(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace broadcast; - mxnet::TShape new_lshape, new_rshape, new_oshape; - int ndim = BinaryBroadcastShapeCompact(outputs[0].shape_, outputs[1].shape_, inputs[0].shape_, - &new_lshape, &new_rshape, &new_oshape); - if (!ndim) { - ElemwiseBinaryOp::BackwardUseNone(attrs, ctx, inputs, req, outputs); - } else { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - Stream *s = ctx.get_stream(); - const TBlob lhs = outputs[0].reshape(new_lshape); - const TBlob rhs = outputs[1].reshape(new_rshape); - const TBlob out = inputs[0].reshape(new_oshape); - BROADCAST_NDIM_SWITCH(ndim, NDim, { - // Request temporary storage - size_t workspace_size = new_oshape.Size(); - Tensor workspace = - ctx.requested[0].get_space_typed( - Shape1(workspace_size * sizeof(index_t)), s); - if (out.shape_.Size() != 0) { - Reduce(s, lhs, req[0], workspace, out); - Reduce(s, rhs, req[1], workspace, out); - } else { - using namespace mxnet_op; - if (lhs.shape_.Size() != 0) { - MSHADOW_TYPE_SWITCH(lhs.type_flag_, LType, { - Kernel::Launch(s, lhs.shape_.Size(), lhs.dptr()); - }); - } - if (rhs.shape_.Size() != 0) { - MSHADOW_TYPE_SWITCH(rhs.type_flag_, RType, { - Kernel::Launch(s, rhs.shape_.Size(), rhs.dptr()); - }); - } - } - }); - }); - } -} -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index 4647acf9d994..b5e267771662 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -270,7 +270,7 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const auto& lstride = calc_stride(lhs.shape_, ndim); const auto& rstride = calc_stride(rhs.shape_, ndim); - size_t output_type_size = util::mshadow_type_info(outputs[0].type_flag_).size; + size_t output_type_size = common::mshadow_type_info(outputs[0].type_flag_).size; const int nvec = output_type_size <= sizeof(uint64_t) ? 
(sizeof(uint64_t) / output_type_size) : 1; @@ -381,12 +381,12 @@ void BinaryBroadcastRTCBackwardUseNone::operator()(const nnvm::NodeAttrs& attrs, using namespace common::cuda::rtc::util; if (lhs.shape_.Size() != 0) { cudaMemsetAsync(lhs.dptr_, 0, - lhs.shape_.Size() * mshadow_type_info(lhs.type_flag_).size, + lhs.shape_.Size() * common::mshadow_type_info(lhs.type_flag_).size, Stream::GetStream(s)); } if (rhs.shape_.Size() != 0) { cudaMemsetAsync(rhs.dptr_, 0, - rhs.shape_.Size() * mshadow_type_info(rhs.type_flag_).size, + rhs.shape_.Size() * common::mshadow_type_info(rhs.type_flag_).size, Stream::GetStream(s)); } } @@ -399,7 +399,48 @@ void BinaryBroadcastRTCBackwardUseIn::operator()(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - LOG(FATAL) << "Not implemented yet!"; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 2U); + // skip kernel launch for zero-size tensors + if (inputs[0].shape_.Size() == 0U) { + return; + } + mxnet::TShape new_lshape, new_rshape, new_oshape; + const bool need_bc = BinaryBroadcastShapeCompact(outputs[0].shape_, + outputs[1].shape_, inputs[0].shape_, + &new_lshape, &new_rshape, &new_oshape) != 0; + if (!need_bc) { + ElemwiseBinaryRTCBwdUseIn {LOP, ROP}(attrs, ctx, inputs, req, outputs); + } else { + BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TBlob lgrad = outputs[0].reshape(new_lshape); + const TBlob rgrad = outputs[1].reshape(new_rshape); + const TBlob ograd = inputs[0].reshape(new_oshape); + const TBlob lhs = inputs[1].reshape(new_lshape); + const TBlob rhs = inputs[2].reshape(new_rshape); + size_t workspace_size_l = broadcast::ReduceWorkspaceSize( + s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, + rhs.shape_, common::mshadow_type_info(outputs[0].type_flag_).size); + size_t workspace_size_r = broadcast::ReduceWorkspaceSize( + s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, + rhs.shape_, common::mshadow_type_info(outputs[1].type_flag_).size); + size_t workspace_size = std::max(workspace_size_l, workspace_size_r); + Tensor workspace = + ctx.requested[0].get_space_typed(Shape1(workspace_size), s); + if (req[0] != kNullOp) { + broadcast::RTCReduce(attrs, ctx, lgrad, req[0], workspace, + ograd, lhs, rhs, "red::sum", NDim, + "mul", LOP); + } + if (req[1] != kNullOp) { + broadcast::RTCReduce(attrs, ctx, rgrad, req[1], workspace, + ograd, lhs, rhs, "red::sum", NDim, + "mul", ROP); + } + }); + } } #endif // MXNET_USE_CUDA diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 8c44a1a79ac9..a6b4d483c73d 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -628,10 +628,10 @@ inline void BinaryBroadcastBackwardUseInImpl(const OpContext& ctx, const TBlob ograd = inputs[0].reshape(new_oshape); const TBlob lhs = inputs[1].reshape(new_lshape); const TBlob rhs = inputs[2].reshape(new_rshape); - size_t workspace_size_l = ReduceWorkspaceSize( - s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_); - size_t workspace_size_r = ReduceWorkspaceSize( - s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_); + size_t workspace_size_l = ReduceWorkspaceSize( + s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); + size_t workspace_size_r = ReduceWorkspaceSize( + s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); size_t 
workspace_size = std::max(workspace_size_l, workspace_size_r); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -686,7 +686,4 @@ void BinaryBroadcastBackwardUseIn(const nnvm::NodeAttrs& attrs, } // namespace op } // namespace mxnet -#ifdef __CUDACC__ -#include "./elemwise_binary_broadcast_op-inl.cuh" -#endif #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu index 07fbb8363530..adc1dbb12cb9 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_broadcast_op_basic.cu @@ -47,23 +47,20 @@ NNVM_REGISTER_OP(broadcast_mul) .set_attr("FComputeEx", BinaryBroadcastComputeSparseEx); NNVM_REGISTER_OP(_backward_broadcast_mul) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"right", "left"}); NNVM_REGISTER_OP(broadcast_div) .set_attr("FCompute", BinaryBroadcastRTCCompute{"div"}) .set_attr("FComputeEx", BinaryBroadcastComputeSparseEx); NNVM_REGISTER_OP(_backward_broadcast_div) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"div_grad", "div_rgrad"}); NNVM_REGISTER_OP(broadcast_mod) .set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_broadcast_mod) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"mod_grad", "mod_rgrad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu b/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu index 3ad4bdc430ce..042a4da2b688 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_broadcast_op_extended.cu @@ -32,29 +32,26 @@ NNVM_REGISTER_OP(broadcast_power) .set_attr("FCompute", BinaryBroadcastRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_broadcast_power) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"power_grad", "power_rgrad"}); NNVM_REGISTER_OP(broadcast_maximum) .set_attr("FCompute", BinaryBroadcastRTCCompute{"max"}); NNVM_REGISTER_OP(_backward_broadcast_maximum) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"greater_equal", "less"}); NNVM_REGISTER_OP(broadcast_minimum) .set_attr("FCompute", BinaryBroadcastRTCCompute{"min"}); NNVM_REGISTER_OP(_backward_broadcast_minimum) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"less_equal", "greater"}); NNVM_REGISTER_OP(broadcast_hypot) .set_attr("FCompute", BinaryBroadcastRTCCompute{"hypot"}); NNVM_REGISTER_OP(_backward_broadcast_hypot) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"hypot_grad_left", + "hypot_grad_right"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index d52886b8842c..861e498ff7c6 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -417,7 +417,7 @@ void ElemwiseBinaryRTCBwdUseIn::operator()(const nnvm::NodeAttrs& attrs, "\n" + binary_kernel_bwd_use_in; // Using 64 bit loads to reduce register pressure - int output_type_size = 
util::mshadow_type_info(outputs[0].type_flag_).size; + size_t output_type_size = common::mshadow_type_info(outputs[0].type_flag_).size; const int nvec = output_type_size <= sizeof(uint64_t) ? (sizeof(uint64_t) / output_type_size) : 1; diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index bf212ef8242e..cc695c826d42 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -111,7 +111,7 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, mshadow::Stream *s = ctx.get_stream(); if (req[0] == kNullOp) return; CHECK_EQ(outputs.size(), 1U); - size_t output_type_size = util::mshadow_type_info(outputs[0].type_flag_).size; + size_t output_type_size = common::mshadow_type_info(outputs[0].type_flag_).size; const int nvec = output_type_size <= sizeof(uint2) ? (sizeof(uint2) / output_type_size) : 1; diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index 7db469e28969..84a60931ae65 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -34,248 +34,56 @@ namespace broadcast { namespace { -constexpr int nthread_reduce = 512; -constexpr int kBaseGridNum = 1024; - -int diff(const TShape& small, const TShape& big, TShape* dims, - TShape* stride) { - int ndim = small.ndim(); - int mdim = 0; - #pragma unroll - for (int i = 0; i < ndim; ++i) { - mdim += small[i] != big[i]; - (*dims)[i] = (*stride)[i] = 1; - } - - index_t s = 1; - #pragma unroll - for (int i = ndim - 1, j = mdim; i >= 0; --i) { - if (small[i] != big[i]) { - --j; - (*stride)[j] = s; - (*dims)[j] = big[i]; - } - s *= big[i]; - } - return mdim; -} - -constexpr int warpSize = 32; -constexpr int unroll_reduce = 2; -constexpr int maxLoopPerTB = 64; - -// Returns a/b integer division rounded up -template -Type ceil_idiv(const Type a, const Type b) { - return (a + b - 1)/b; -} - -uint64_t calc_num_load(const int X, const int Y, const int* strides) { - // Number of full warps - uint64_t num_full_warp = X / warpSize; - // Length of the partial warp i.e. 
number of threads that are performing loads - uint64_t len_part_warp = X % warpSize; - - uint64_t num_load_full = (std::min(warpSize, strides[0]) + - std::min(warpSize, strides[1]) + - std::min(warpSize, strides[2]))*num_full_warp; - - uint64_t num_load_part = - (std::min(len_part_warp, ceil_idiv(len_part_warp*strides[0], warpSize)) + - std::min(len_part_warp, ceil_idiv(len_part_warp*strides[1], warpSize)) + - std::min(len_part_warp, ceil_idiv(len_part_warp*strides[2], warpSize)))* - (len_part_warp != 0); - - uint64_t num_load = (num_load_full + num_load_part)*(uint64_t)Y; - return num_load; -} - -struct RTCReduceImplConfig { - index_t N; - index_t M; - index_t Mnext; - struct { - dim3 blockDim; - dim3 gridDim; - int shMemSize; - bool do_transpose; - } kernel_1; - struct { - int blockSize; - int gridSize; - } kernel_2; - size_t workspace_size; - - TShape rshape, rstride; - TShape lhs_shape, lhs_stride; - TShape rhs_shape, rhs_stride; - - RTCReduceImplConfig(const ::mxnet::TShape& small, const ::mxnet::TShape& big, - const size_t type_size, const ::mxnet::TShape* lhs, - const ::mxnet::TShape* rhs) : - rshape(small.ndim(), 1), rstride(small.ndim(), 1), - lhs_shape(small.ndim(), 1), lhs_stride(small.ndim(), 1), - rhs_shape(small.ndim(), 1), rhs_stride(small.ndim(), 1) { - int ndim = small.ndim(); - - diff(small, big, &rshape, &rstride); - N = small.Size(); - - M = rshape[0]; - for (int i = 1; i < ndim; ++i) { - M *= rshape[i]; - } - - bool multiOp = false; - if (lhs != nullptr) { - CHECK_NOTNULL(rhs); - diff(small, *lhs, &lhs_shape, &lhs_stride); - diff(small, *rhs, &rhs_shape, &rhs_stride); - multiOp = true; - } - - workspace_size = 0; - - if (M == 1) { - kernel_1.blockDim.x = nthread_reduce; - kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - (N + kernel_1.blockDim.x - 1)/kernel_1.blockDim.x); - } else { - - int reduce_strides[3]; - reduce_strides[0] = fastest_stride(small, big, big); - reduce_strides[1] = (multiOp) ? fastest_stride(small, *lhs, *lhs) : 1; - reduce_strides[2] = (multiOp) ? fastest_stride(small, *rhs, *rhs) : 1; - - int reduce_strides_transp[3]; - reduce_strides_transp[0] = fastest_stride(small, rshape, rstride); - reduce_strides_transp[1] = (multiOp) ? - fastest_stride(small, lhs_shape, lhs_stride) : 1; - reduce_strides_transp[2] = (multiOp) ? 
- fastest_stride(small, rhs_shape, rhs_stride) : 1; - - uint64_t num_load = calc_num_load(N, M, reduce_strides); - uint64_t num_load_transp = calc_num_load(M, N, reduce_strides_transp); - - Mnext = 1; - kernel_1.do_transpose = (num_load > num_load_transp); - - kernel_1.blockDim.x = 0; - kernel_1.blockDim.y = 0; - - if (kernel_1.do_transpose) { - // Fastest thread ID goes through M - // Loop over N has step size kernel_1.blockDim.y - if (N < 8) { - kernel_1.blockDim.y = 1; - } else if (N < 256) { - kernel_1.blockDim.y = 4; - } else { - if (M < 8) { - kernel_1.blockDim.x = 1; - } else if (M < 256) { - kernel_1.blockDim.x = 4; - } else { - kernel_1.blockDim.x = warpSize; - } - } - } else { - // Fastest thread ID goes through N - // Loop over M has step size kernel_1.blockDim.y - if (M < 8) { - kernel_1.blockDim.y = 1; - } else if (M < 256) { - kernel_1.blockDim.y = 4; - } else { - if (N < 8) { - kernel_1.blockDim.x = 1; - } else if (N < 256) { - kernel_1.blockDim.x = 4; - } else { - kernel_1.blockDim.x = warpSize; - } - } - } - - if (kernel_1.blockDim.x == 0 && kernel_1.blockDim.y == 0) { - LOG(FATAL) << "Unable to set blockDim"; - } else if (kernel_1.blockDim.x == 0) { - kernel_1.blockDim.x = nthread_reduce / kernel_1.blockDim.y; - } else if (kernel_1.blockDim.y == 0) { - kernel_1.blockDim.y = nthread_reduce / kernel_1.blockDim.x; - } - - if (kernel_1.do_transpose) { - // Fastest thread ID goes through M - kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - ceil_idiv(N, kernel_1.blockDim.y)); - kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); - int by = kernel_1.blockDim.y; - if (kernel_1.blockDim.y % warpSize == 0) { - // Fix shared memory bank conflict - by++; - } - kernel_1.shMemSize = (kernel_1.blockDim.x > 1) ? - kernel_1.blockDim.x*by*type_size * 2 : 0; - // Maximum number of times we want TB to loop in M - // Max size of M-block each TB can handle - int maxMblock = kernel_1.blockDim.x*maxLoopPerTB; - Mnext = (M + maxMblock - 1) / maxMblock; - } else { - // Fastest thread ID goes through N - kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - ceil_idiv(N, kernel_1.blockDim.x)); - kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); - kernel_1.shMemSize = (kernel_1.blockDim.y > 1) ? 
- kernel_1.blockDim.x*kernel_1.blockDim.y*type_size * 2 : 0; - // Maximum number of times we want TB to loop in M - // Max size of M-block each TB can handle - int maxMblock = kernel_1.blockDim.y*maxLoopPerTB; - Mnext = (M + maxMblock - 1) / maxMblock; - } - - if (Mnext > 1) { - // small_dptr[] is N*Mnext*type_size bytes - workspace_size += N*Mnext*sizeof(double); - // Set gridDim.y to Mnext - kernel_1.gridDim.y = std::min(kBaseGridNum, Mnext); - } - - if (Mnext > 1) { - kernel_2.blockSize = nthread_reduce; - kernel_2.gridSize = std::min((int)kBaseGridNum, - (N + kernel_2.blockSize - 1)/kernel_2.blockSize ); - } - - } - } - -}; - struct reduce_kernel_params { index_t big_shape[MAX_DIM]; index_t small_shape[MAX_DIM]; + index_t lhs_shape0[MAX_DIM]; + index_t rhs_shape0[MAX_DIM]; index_t rshape[MAX_DIM]; index_t rstride[MAX_DIM]; + index_t lhs_stride[MAX_DIM]; + index_t rhs_stride[MAX_DIM]; + index_t lhs_shape[MAX_DIM]; + index_t rhs_shape[MAX_DIM]; }; +const char reduce_function_code[] = R"code( +#define FUNC OP(IType0::from(big[idx_big[u]])) +)code"; + +const char reduce_function_use_input_code[] = R"code( +#define FUNC OP1(IType0::from(big[idx_big[u]]), \ + OP2(IType1::from(lhs[idx_lhs[u]]), \ + IType2::from(rhs[idx_rhs[u]]))) +)code"; + const char reduce_kernel_code[] = R"code( struct reduce_kernel_params { index_t big_shape[util::MAX_DIM]; index_t small_shape[util::MAX_DIM]; + index_t lhs_shape0[util::MAX_DIM]; + index_t rhs_shape0[util::MAX_DIM]; index_t rshape[util::MAX_DIM]; index_t rstride[util::MAX_DIM]; + index_t lhs_stride[util::MAX_DIM]; + index_t rhs_stride[util::MAX_DIM]; + index_t lhs_shape[util::MAX_DIM]; + index_t rhs_shape[util::MAX_DIM]; }; __global__ void reduce_kernel(const int N, const int M, const bool addto, const InputType0* __restrict big, + const InputType1* __restrict lhs, + const InputType2* __restrict rhs, OutputType0 *small, const reduce_kernel_params params, const int Mnext) { extern __shared__ char shTileChar[]; - using IType = AccType; + using IType0 = AccType; + using IType1 = AccType; + using IType2 = AccType; using OType = AccType; - using AType = typename IType::type; + using AType = typename IType0::type; AType* shTile = (AType*)(shTileChar); const int tid = threadIdx.x + threadIdx.y*blockDim.x; const int bx = (do_transpose) ? 
blockDim.y : blockDim.x; @@ -288,23 +96,38 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, const index_t Mend = (index_t)((int64)M*(int64)(m0 + 1)/(int64)Mnext); for (index_t idx0 = blockIdx.x*bx; idx0 < N; idx0 += bx*gridDim.x) { int idx = idx0 + tidx; - index_t idx_big0 = util::unravel_ravel(idx, params.small_shape, params.big_shape); + index_t coord[ndim]; + util::unravel(idx, params.small_shape, coord); + index_t idx_big0, idx_lhs0, idx_rhs0; + idx_big0 = util::ravel(coord, params.big_shape); + if (use_input) { + idx_lhs0 = util::ravel(coord, params.lhs_shape0); + idx_rhs0 = util::ravel(coord, params.rhs_shape0); + } AType val, residual; REDUCER::SetInitValue(val, residual); if (idx < N) { for (index_t k = tidy + Mstart; k < Mend; k += by*UNROLL) { index_t idx_big[UNROLL]; + index_t idx_lhs[UNROLL]; + index_t idx_rhs[UNROLL]; #pragma unroll for (int u=0;u < UNROLL;u++) { idx_big[u] = idx_big0 + util::unravel_dot(k + u*by, params.rshape, params.rstride); + if (use_input) { + idx_lhs[u] = idx_lhs0 + util::unravel_dot(k + u*by, params.lhs_shape, + params.lhs_stride); + idx_rhs[u] = idx_rhs0 + util::unravel_dot(k + u*by, params.rhs_shape, + params.rhs_stride); + } } typename OType::type tmp[UNROLL]; #pragma unroll for (int u=0;u < UNROLL;u++) { if (k + u*by < Mend) { - tmp[u] = OP(OType::from(big[idx_big[u]])); + tmp[u] = FUNC; } } #pragma unroll @@ -385,8 +208,9 @@ __global__ void reduce_lines_kernel(const index_t N, const index_t M, void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, const TBlob& big, const Tensor& workspace, - const RTCReduceImplConfig& config, const int ndim, - const std::string &common_code, int dev_id) { + const ReduceImplConfig& config, const int ndim, + const std::string &common_code, int dev_id, + const TBlob *lhs = nullptr, const TBlob *rhs = nullptr) { using namespace common::cuda::rtc; void* small_dptr = small.dptr_; bool first_kernel_addto = addto; @@ -402,7 +226,7 @@ void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, const int by = (config.kernel_1.do_transpose) ? config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; - const bool do_unroll = ( config.M / (by*config.Mnext) >= unroll_reduce ); + const bool do_unroll = (config.M / (by*config.Mnext) >= unroll_reduce); std::string code = common_code + "#define UNROLL " + (do_unroll ? std::to_string(unroll_reduce) : "1") + @@ -411,11 +235,26 @@ void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, (config.kernel_1.do_transpose ? "true" : "false") + ";\n" "using InputType0 = " + - util::mshadow_type_info(big.type_flag_).name + + common::mshadow_type_info(big.type_flag_).name + ";\n" "using OutputType0 = " + - util::mshadow_type_info(small.type_flag_).name + + common::mshadow_type_info(small.type_flag_).name + + ";\n" + "using InputType1 = " + + ((lhs != nullptr) + ? common::mshadow_type_info(lhs->type_flag_).name + : "float32") + + ";\n" + "using InputType2 = " + + ((rhs != nullptr) + ? 
common::mshadow_type_info(rhs->type_flag_).name + : "float32") + ";\n"; + if (lhs != nullptr) { + code += "const bool use_input = true;"; + } else { + code += "const bool use_input = false;"; + } reduce_kernel_params param {}; for (int i = 0; i < ndim; ++i) { @@ -423,19 +262,41 @@ void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, param.small_shape[i] = small.shape_[i]; param.rshape[i] = config.rshape[i]; param.rstride[i] = config.rstride[i]; + if (lhs != nullptr) { + param.lhs_shape0[i] = lhs->shape_[i]; + param.rhs_shape0[i] = rhs->shape_[i]; + param.lhs_shape[i] = config.lhs_shape[i]; + param.rhs_shape[i] = config.rhs_shape[i]; + param.lhs_stride[i] = config.lhs_stride[i]; + param.rhs_stride[i] = config.rhs_stride[i]; + } } + void *null_ptr = nullptr; std::vector args; args.emplace_back(&config.N); args.emplace_back(&config.M); args.emplace_back(&first_kernel_addto); args.emplace_back(&big.dptr_); + if (lhs != nullptr) { + args.emplace_back(&(lhs->dptr_)); + args.emplace_back(&(rhs->dptr_)); + } else { + args.emplace_back(&(null_ptr)); + args.emplace_back(&(null_ptr)); + } args.emplace_back(&small_dptr); args.emplace_back(¶m); args.emplace_back(&config.Mnext); - auto reduce_kernel_func = get_function(code + reduce_kernel_code, "reduce_kernel", dev_id); - launch(reduce_kernel_func, config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, s, &args); + const auto &function_code = (lhs == nullptr) + ? reduce_function_code + : reduce_function_use_input_code; + auto reduce_kernel_func = get_function(code + function_code + reduce_kernel_code, + "reduce_kernel", dev_id); + launch(reduce_kernel_func, config.kernel_1.gridDim, + config.kernel_1.blockDim, + config.kernel_1.shMemSize, s, &args); if (config.Mnext > 1) { args.resize(0); @@ -452,6 +313,91 @@ void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, } } +struct reduce_kernel_M1_params { + index_t big_shape[MAX_DIM]; + index_t lhs_shape[MAX_DIM]; + index_t rhs_shape[MAX_DIM]; + index_t small_shape[MAX_DIM]; +}; + +const char reduce_kernel_M1_code[] = R"code( +struct reduce_kernel_M1_params { + index_t big_shape[util::MAX_DIM]; + index_t lhs_shape[util::MAX_DIM]; + index_t rhs_shape[util::MAX_DIM]; + index_t small_shape[util::MAX_DIM]; +}; + +__global__ void reduce_kernel_M1(const int N, + const InputType0* __restrict big, + const InputType1* __restrict lhs, + const InputType2* __restrict rhs, + OutputType0 *small, + const reduce_kernel_M1_params params) { + using IType0 = AccType; + using IType1 = AccType; + using IType2 = AccType; + using OType = AccType; + for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { + index_t coord[ndim]; + util::unravel(idx, params.small_shape, coord); + const index_t idx_big = util::ravel(coord, params.big_shape); + const index_t idx_lhs = util::ravel(coord, params.lhs_shape); + const index_t idx_rhs = util::ravel(coord, params.rhs_shape); + const typename OType::type val = + OP1(IType0::from(big[idx_big]), OP2(IType1::from(lhs[idx_lhs]), + IType2::from(rhs[idx_rhs]))); + if (req == OpReqType::kAddTo) { + const auto temp = op::add(val, OType::from(small[idx])); + small[idx] = OType::to(temp); + } else { + small[idx] = OType::to(val); + } + } +} +)code"; + +void RTCReduceM1Impl(Stream *s, const TBlob &small, const TBlob &big, + const TBlob &lhs, const TBlob &rhs, + const ReduceImplConfig &config, const int ndim, + const std::string &common_code, int dev_id) { + using namespace common::cuda::rtc; + + std::string code = 
common_code + + "using InputType0 = " + + common::mshadow_type_info(big.type_flag_).name + + ";\n" + "using InputType1 = " + + common::mshadow_type_info(lhs.type_flag_).name + + ";\n" + "using InputType2 = " + + common::mshadow_type_info(rhs.type_flag_).name + + ";\n" + "using OutputType0 = " + + common::mshadow_type_info(small.type_flag_).name + + ";\n"; + reduce_kernel_M1_params param {}; + for (int i = 0; i < ndim; ++i) { + param.big_shape[i] = big.shape_[i]; + param.small_shape[i] = small.shape_[i]; + param.lhs_shape[i] = lhs.shape_[i]; + param.rhs_shape[i] = rhs.shape_[i]; + } + + std::vector args; + args.emplace_back(&config.N); + args.emplace_back(&big.dptr_); + args.emplace_back(&lhs.dptr_); + args.emplace_back(&rhs.dptr_); + args.emplace_back(&small.dptr_); + args.emplace_back(¶m); + + auto reduce_kernel_M1_func = get_function(code + reduce_kernel_M1_code, + "reduce_kernel_M1", dev_id); + launch(reduce_kernel_M1_func, config.kernel_1.gridDim, + config.kernel_1.blockDim, + config.kernel_1.shMemSize, s, &args); +} } // namespace @@ -467,11 +413,11 @@ void RTCReduce(const NodeAttrs& attrs, using namespace mxnet::common::cuda::rtc; if (req == kNullOp) return; Stream *s = ctx.get_stream(); - size_t type_size = util::mshadow_type_info(small.type_flag_).size; + size_t type_size = common::mshadow_type_info(small.type_flag_).size; if (small.type_flag_ == mshadow::kFloat16) { type_size = sizeof(float); } - RTCReduceImplConfig config(small.shape_, big.shape_, type_size, nullptr, nullptr); + ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr, type_size); if (config.M == 1) { // With M == 1 result is just (possibly reshaped) OP(big) UnaryRTCCompute {OP} (attrs, ctx, {big}, {req}, {small}); @@ -493,6 +439,49 @@ void RTCReduce(const NodeAttrs& attrs, } } +void RTCReduce(const NodeAttrs& attrs, + const OpContext& ctx, + const TBlob& small, + const OpReqType req, + const Tensor& workspace, + const TBlob& big, + const TBlob &lhs, + const TBlob &rhs, + const std::string& reducer, + int ndim, + const std::string& OP1, + const std::string& OP2) { + using namespace mxnet::common::cuda::rtc; + if (req == kNullOp) return; + Stream *s = ctx.get_stream(); + size_t type_size = common::mshadow_type_info(small.type_flag_).size; + if (small.type_flag_ == mshadow::kFloat16) { + type_size = sizeof(float); + } + ReduceImplConfig config(small.shape_, big.shape_, &lhs.shape_, &rhs.shape_, type_size); + std::string common_code = std::string("const OpReqType req = ") + + util::to_string(req) + + ";\n" + "#define OP1 op::" + + OP1 + + "\n" + "#define OP2 op::" + + OP2 + + "\n" + "#define REDUCER " + + reducer + + "\n" + "const int ndim = " + + std::to_string(ndim) + + ";\n"; + if (config.M == 1) { + RTCReduceM1Impl(s, small, big, lhs, rhs, config, ndim, common_code, ctx.run_ctx.ctx.dev_id); + } else { + RTCReduceImpl(s, small, req == kAddTo, big, workspace, config, + ndim, common_code, ctx.run_ctx.ctx.dev_id, &lhs, &rhs); + } +} + #endif // MXNET_USE_CUDA } // namespace broadcast From 8174ca234ef516b02570d14e02563766516f4707 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 22 Jun 2020 13:54:01 -0700 Subject: [PATCH 15/64] Remove non-RTC vectorization support --- src/common/cuda/vectorization.cuh | 285 --------------------- src/operator/tensor/broadcast_reduce-inl.h | 3 - src/operator/tensor/elemwise_binary_op.cuh | 249 ------------------ src/operator/tensor/elemwise_binary_op.h | 4 - 4 files changed, 541 deletions(-) delete mode 100644 src/common/cuda/vectorization.cuh delete mode 100644 
src/operator/tensor/elemwise_binary_op.cuh diff --git a/src/common/cuda/vectorization.cuh b/src/common/cuda/vectorization.cuh deleted file mode 100644 index bc7840a7014a..000000000000 --- a/src/common/cuda/vectorization.cuh +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2020 by Contributors - * \file cuda_vectorization.cuh - * \brief GPU helpers for vectorized memory accesses - */ - -#ifndef MXNET_COMMON_CUDA_VECTORIZATION_CUH_ -#define MXNET_COMMON_CUDA_VECTORIZATION_CUH_ - -#include - -#if MXNET_USE_CUDA - -#include -#include "utils.h" - - -namespace mxnet { -namespace common { -namespace cuda { - -/* \brief Helper class that enables storing multiple values of type DType - as 1 value of type LType. -*/ -template -class VectorizedStorage { - public: - constexpr static int nvec = sizeof(LType) / sizeof(DType); - union vectorized_storage { - LType aligned; - DType separate[nvec]; // NOLINT(*) - - MSHADOW_XINLINE vectorized_storage() {} - MSHADOW_XINLINE ~vectorized_storage() {} - } scratch_; -}; - -/* \brief Helper class that enables accessing multiple values of type DType - as 1 value of type LType. Additional aligned template argument - allows performance optimizations if the pointer and the size of - the allocation is aligned to sizeof(LType) / sizeof(DType) elements. -*/ -template -class VectorizedAccessor { - public: - using StorageType = VectorizedStorage::type, - typename std::remove_const::type>; - StorageType storage_; - - LType* aligned_ptr_; - DType* unaligned_ptr_; - int alignment_; - index_t n_elems_; - - MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t size) { - unaligned_ptr_ = ptr; - if (aligned) { - alignment_ = 0; - aligned_ptr_ = reinterpret_cast(ptr); - n_elems_ = (size + storage_.nvec - 1) / storage_.nvec; - } else { - size_t ptr_as_number = reinterpret_cast(ptr); - alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); - aligned_ptr_ = reinterpret_cast(ptr - alignment_); - n_elems_ = (size + alignment_ + storage_.nvec - 1) / storage_.nvec; - } - } - - /* \brief Alignment of the input pointer in elements. */ - MSHADOW_XINLINE int alignment() const { - return alignment_; - } - - /* \brief Access to separate elements. */ - MSHADOW_XINLINE DType* separate() { - return storage_.scratch_.separate; - } - - /* \brief Number of elements stored. */ - MSHADOW_XINLINE constexpr int nvec() const { - return storage_.nvec; - } - - /* \brief Number of aligned elements that span the entire input tensor. */ - MSHADOW_XINLINE index_t num_aligned_elements() const { - return n_elems_; - } - - /* \brief Load values from the input. - \param id Aligned index of the element. - \param N size of the tensor. 
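Aside from the patch text: the helpers being deleted here revolve around one idea — aliasing a wide load type onto several narrow elements so that a single memory transaction moves nvec values at once. Below is a minimal standalone sketch of that union trick in plain host C++; it is illustrative only (VecPack and its members are made-up names, not code from this repository), and it relies on the same union type-punning the deleted VectorizedStorage helper uses.

    #include <cstdint>
    #include <cstdio>

    // Pack nvec narrow elements behind one wide load/store type.
    template <typename DType, typename LType>
    struct VecPack {
      static constexpr int nvec = sizeof(LType) / sizeof(DType);
      union {
        LType aligned;         // one wide load/store
        DType separate[nvec];  // element-wise view of the same bytes
      } data;
    };

    int main() {
      // Aligned so the single 8-byte load below is valid.
      alignas(alignof(std::uint64_t)) float values[4] = {1.f, 2.f, 3.f, 4.f};
      VecPack<float, std::uint64_t> pack;  // nvec == 2 for float behind uint64_t
      // One 8-byte transaction fills values[0] and values[1] at once.
      pack.data.aligned = *reinterpret_cast<const std::uint64_t*>(values);
      std::printf("%g %g\n", pack.data.separate[0], pack.data.separate[1]);  // 1 2
      return 0;
    }
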
- */ - MSHADOW_XINLINE void load(const index_t id, const index_t N) { - if (aligned) { - storage_.scratch_.aligned = aligned_ptr_[id]; - } else { - if (id > 0 && id < n_elems_ - 1) { - storage_.scratch_.aligned = aligned_ptr_[id]; - } else { -#pragma unroll - for (int j = 0; j < storage_.nvec; ++j) { - DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; - if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && - reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { - storage_.scratch_.separate[j] = *ptr; - } - } - } - } - } -}; - -/* \brief Class used for vectorized read-only access. */ -template -class VectorizedLoader : public VectorizedAccessor { - public: - MSHADOW_XINLINE VectorizedLoader(const DType* ptr, const index_t N) : - VectorizedAccessor(ptr, N) { - } -}; - -/* \brief Class used for vectorized writable access. */ -template -class VectorizedStorer : public VectorizedAccessor { - public: - MSHADOW_XINLINE VectorizedStorer(DType* ptr, const index_t N) : - VectorizedAccessor(ptr, N) { - } - - /* \brief Store values to the output. - \param id Aligned index of the element. - \param N size of the tensor. - */ - MSHADOW_XINLINE void store(const index_t id, const index_t N) { - if (aligned) { - this->aligned_ptr_[id] = this->storage_.scratch_.aligned; - } else { - if (id > 0 && id < this->n_elems_ - 1) { - this->aligned_ptr_[id] = this->storage_.scratch_.aligned; - } else { -#pragma unroll - for (int j = 0; j < this->storage_.nvec; ++j) { - DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; - if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && - reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { - *ptr = this->storage_.scratch_.separate[j]; - } - } - } - } - } -}; - -namespace { - -enum class Alignment { - SAME_ALIGNED, // All tensors aligned - SAME_UNALIGNED, // All tensors have the same misalignment - DIFFERENT // Tensors have different alignment -}; - -template -int CalcAlignment(const DType* ptr) { - size_t ptr_as_number = reinterpret_cast(ptr); - return ptr_as_number % sizeof(LType); -} - -/* \brief Check alignment of the inputs and outputs when cast to LType*. - \param params Structuce containing arrays with inputs' and outputs' pointers - \param lead_dim Leading dimension of the tensors. - \param other_dim The size of the other dimensions of the tensors. -*/ -template -Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim) { - int align = -1; - constexpr int nvec = sizeof(LType) / sizeof(DType); - - for (const DType* ptr : params.inputs) { - int new_align = CalcAlignment(ptr); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; - } - } - } - - for (const DType* ptr : params.outputs) { - int new_align = CalcAlignment(ptr); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; - } - } - } - - if ((other_dim != 1) && - (lead_dim % nvec != 0)) { - return Alignment::DIFFERENT; - } - - if ((align == 0) && - (lead_dim % nvec == 0)) { - return Alignment::SAME_ALIGNED; - } else { - return Alignment::SAME_UNALIGNED; - } -} - -constexpr int vectorized_kernel_thread_num = 512; - -} // namespace - -/* \brief Helper launcher function for the vectorized kernels. Checks for alignment of the - input and output tensors and launches a proper template. - \param lead_dim Leading dimension of the tensors. - \param other_dim The size of the other dimensions. 
- \param s Stream which should be used for launching the kernel. - \param params Input parameters to the kernel. Needs to contain at least 2 arrays of DType*: - inputs and outputs, which contain input and output pointers. -*/ -template -void VectorizedKernelLauncher(const index_t lead_dim, - const index_t other_dim, - mshadow::Stream* s, - typename Kernel::ParamType params) { - static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); - if (lead_dim * other_dim != 0) { - cudaStream_t stream = mshadow::Stream::GetStream(s); - VectorizedLoader l(params.inputs[0], lead_dim); - size_t num_elements = other_dim * l.num_aligned_elements(); - constexpr int threads = vectorized_kernel_thread_num; - constexpr int max_blocks = 65535; - index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), - max_blocks); - auto align = CheckAlignment(params, lead_dim, other_dim); - switch (align) { - case Alignment::SAME_ALIGNED: - Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); - break; - case Alignment::SAME_UNALIGNED: - Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); - break; - case Alignment::DIFFERENT: { - const index_t size = lead_dim * other_dim; - index_t blocks = std::min(static_cast((size + threads - 1) / - threads), - max_blocks); - // If the pointers are aligned differently we cannot vectorize - Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); - break; - } - } - } -} - -} // namespace cuda -} // namespace common -} // namespace mxnet - -#endif // MXNET_USE_CUDA - -#endif // MXNET_COMMON_CUDA_VECTORIZATION_CUH_ diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index b263c3b01454..d2438103cf27 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -33,9 +33,6 @@ #include "../mshadow_op.h" #include "../mxnet_op.h" #include "../operator_common.h" -#if MXNET_USE_CUDA -#include "../../common/cuda/vectorization.cuh" -#endif namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh deleted file mode 100644 index 88612f1546a0..000000000000 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2020 by Contributors - * \file elemwise_binary_op.cuh - * \brief GPU helpers for elementwise operators - */ - -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ - -#include -#include "../operator_common.h" -#include "../../common/cuda/vectorization.cuh" - -#include - -#if MXNET_USE_CUDA - -namespace mxnet { -namespace op { - -namespace binary { - -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -struct VectorizedBinaryKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; -}; - -template -__global__ void VectorizedBinaryKernelBwdUseNone( - const VectorizedBinaryKernelParams params, - const index_t N) { - VectorizedLoader loader(params.inputs[0], N); - VectorizedStorer lstorer(params.outputs[0], N); - VectorizedStorer rstorer(params.outputs[1], N); - - const index_t M = loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader.load(tid, N); - if (lreq == kAddTo) { - lstorer.load(tid, N); - } - if (rreq == kAddTo) { - rstorer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader.nvec(); ++i) { - DType inp = loader.separate()[i]; - if (!((std::is_same::value && lreq == kWriteInplace) || - lreq == kNullOp)) { - DType ltemp = LOP::Map(inp); - if (lreq == kAddTo) { - lstorer.separate()[i] += ltemp; - } else { - lstorer.separate()[i] = ltemp; - } - lstorer.store(tid, N); - } - if (!((std::is_same::value && rreq == kWriteInplace) || - rreq == kNullOp)) { - DType rtemp = ROP::Map(inp); - - if (rreq == kAddTo) { - rstorer.separate()[i] += rtemp; - } else { - rstorer.separate()[i] = rtemp; - } - rstorer.store(tid, N); - } - } - } -} - -template -__global__ void VectorizedBinaryKernelBwdUseIn( - const VectorizedBinaryKernelParams params, - const index_t N) { - VectorizedLoader ograd_loader(params.inputs[0], N); - VectorizedLoader linput_loader(params.inputs[1], N); - VectorizedLoader rinput_loader(params.inputs[2], N); - VectorizedStorer lstorer(params.outputs[0], N); - VectorizedStorer rstorer(params.outputs[1], N); - - const index_t M = ograd_loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - ograd_loader.load(tid, N); - linput_loader.load(tid, N); - rinput_loader.load(tid, N); - if (lreq == kAddTo) { - lstorer.load(tid, N); - } - if (rreq == kAddTo) { - rstorer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < ograd_loader.nvec(); ++i) { - DType ograd = ograd_loader.separate()[i]; - DType linput = linput_loader.separate()[i]; - DType rinput = rinput_loader.separate()[i]; - if (!(lreq == kNullOp)) { - DType ltemp = ograd * LOP::Map(linput, rinput); - if (lreq == kAddTo) { - lstorer.separate()[i] += ltemp; - } else { - lstorer.separate()[i] = ltemp; - } - lstorer.store(tid, N); - } - if (!(rreq == kNullOp)) { - DType rtemp = ograd * ROP::Map(linput, rinput); - - if (rreq == kAddTo) { - rstorer.separate()[i] += rtemp; - } else { - rstorer.separate()[i] = rtemp; - } - rstorer.store(tid, N); - } - } - } -} - -template -class VectorizedBinaryBwdUseNone { - public: - using ParamType = VectorizedBinaryKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - 
VectorizedBinaryKernelBwdUseNone - <<>>(params, lead_dim); - } -}; - -template -class VectorizedBinaryBwdUseIn { - public: - using ParamType = VectorizedBinaryKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryKernelBwdUseIn - <<>>(params, lead_dim); - } -}; - -} // namespace binary - -template -void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary; - cudaStream_t stream = mshadow::Stream::GetStream(s); - - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { - const index_t size = inputs[0].Size(); - if (req[0] != kNullOp || req[1] != kNullOp) { - MXNET_REQ_TYPE_SWITCH(req[0], lreq, { - MXNET_REQ_TYPE_SWITCH(req[1], rreq, { - using LType = uint4; - using Kernel = VectorizedBinaryBwdUseNone; - - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.outputs[1] = outputs[1].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); - } - }); -} - -template -void ElemwiseBinaryOp::BackwardUseIn_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary; - if (req[0] != kNullOp || req[1] != kNullOp) { - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { - MXNET_REQ_TYPE_SWITCH(req[0], lreq, { - MXNET_REQ_TYPE_SWITCH(req[1], rreq, { - const index_t size = inputs[0].Size(); - // Using 64 bit loads to reduce register pressure - using LType = uint2; - using Kernel = VectorizedBinaryBwdUseIn; - - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.inputs[1] = inputs[1].dptr(); - params.inputs[2] = inputs[2].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.outputs[1] = outputs[1].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); - }); - } -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 156a88955550..d1f73e9bb6ec 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -904,8 +904,4 @@ struct ElemwiseBinaryRTCBwdUseIn { } // namespace op } // namespace mxnet -#ifdef __CUDACC__ -#include "elemwise_binary_op.cuh" -#endif // __CUDACC__ - #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_H_ From 3a7541193876260edc0a62cec5e191e827e29b7f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 23 Jun 2020 13:38:43 -0700 Subject: [PATCH 16/64] Remove template from ReduceWorkspaceSize --- src/operator/nn/group_norm-inl.h | 32 ++++++++----------- src/operator/nn/layer_norm-inl.h | 32 ++++++++----------- .../linalg/broadcast_reduce_op_customized.h | 4 +-- src/operator/numpy/np_broadcast_reduce_op.h | 12 +++---- src/operator/numpy/np_cross-inl.h | 7 ++-- src/operator/numpy/np_elemwise_broadcast_op.h | 6 ++-- src/operator/numpy/np_where_op-inl.h | 12 +++---- src/operator/numpy/random/np_exponential_op.h | 2 +- .../numpy/random/np_location_scale_op.h | 10 +++--- src/operator/numpy/random/np_normal_op.h | 10 +++--- src/operator/numpy/random/np_pareto_op.h | 2 +- src/operator/numpy/random/np_rayleigh_op.h | 2 +- 
src/operator/numpy/random/np_weibull_op.h | 2 +- .../quantization/quantization_utils.h | 2 +- src/operator/random/pdf_op.h | 2 +- src/operator/tensor/broadcast_reduce-inl.h | 24 ++++++-------- src/operator/tensor/broadcast_reduce_op.h | 8 ++--- .../tensor/elemwise_binary_broadcast_op.cc | 4 +-- .../tensor/elemwise_binary_broadcast_op.h | 4 +-- 19 files changed, 77 insertions(+), 100 deletions(-) diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index adb22e20244b..7cd2ca8997e3 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -115,11 +115,9 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, size_t workspace_size = 0; MSHADOW_REAL_TYPE_SWITCH(data.type_flag_, DType, { - BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { - workspace_size = - broadcast::ReduceWorkspaceSize(s, red_dst_shape, req[0], - red_src_shape, sizeof(DType)); - }); + workspace_size = + broadcast::ReduceWorkspaceSize(s, red_dst_shape, req[0], + red_src_shape, sizeof(DType)); }); workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -280,20 +278,16 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, // There are two types of reduction workloads: reduce over axis and reduce exclude axis // We take the maximum of the workspace sizes required by these workloads. // Also, we explicitly set the req_type=kAddto in case we want to use it. - BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { - reduce_workspace_size = - std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_dst_shape, - kAddTo, red_src_shape, - sizeof(DType))); - }); - BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { - reduce_workspace_size = - std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, - red_exclude_src_shape, - sizeof(DType))); - }); + reduce_workspace_size = + std::max(reduce_workspace_size, + broadcast::ReduceWorkspaceSize(s, red_dst_shape, + kAddTo, red_src_shape, + sizeof(DType))); + reduce_workspace_size = + std::max(reduce_workspace_size, + broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, + red_exclude_src_shape, + sizeof(DType))); }); workspace = ctx.requested[0].get_space_typed( Shape1(reduce_workspace_size + data_size * 2 + red_out_size), s); diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index 49ac52dc6a7c..e6e5a41f0fac 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -108,11 +108,9 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, Tensor workspace; size_t workspace_size = 0; MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { - workspace_size = - broadcast::ReduceWorkspaceSize(s, mean_data.shape_, req[0], - in_data.shape_, sizeof(DType)); - }); + workspace_size = + broadcast::ReduceWorkspaceSize(s, mean_data.shape_, req[0], + in_data.shape_, sizeof(DType)); }); workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -256,20 +254,16 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, // There are two types of reduction workloads: reduce over axis and reduce exclude axis // We take the maximum of the workspace sizes required by these workloads. // Also, we explicitly set the req_type=kAddto in case we want to use it. 
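Aside from the patch text: taken together, these hunks converge on a single call pattern — the workspace query now takes the element size as a run-time argument, so only sizeof(DType) has to come out of the type switch and the surrounding BROADCAST_NDIM_SWITCH used purely to pick template arguments goes away. A minimal sketch of a caller under that assumption follows; QueryReduceWorkspace and its parameters are illustrative names, not identifiers from the patch, and the fragment assumes the usual mshadow/mxnet::op headers and usings available in these operator files.

    // Illustrative only: mirrors the call shape of the group_norm/layer_norm
    // hunks above after the template parameters were dropped.
    size_t QueryReduceWorkspace(mshadow::Stream<gpu>* s,
                                const mxnet::TShape& dst_shape,
                                const mxnet::TShape& src_shape,
                                const OpReqType req,
                                const int dtype_flag) {
      size_t ws = 0;
      MSHADOW_REAL_TYPE_SWITCH(dtype_flag, DType, {
        // The element size is an ordinary argument now, not a template parameter.
        ws = broadcast::ReduceWorkspaceSize(s, dst_shape, req, src_shape,
                                            sizeof(DType));
      });
      return ws;
    }
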
- BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { - reduce_workspace_size = - std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_dst_shape, - kAddTo, red_src_shape, - sizeof(DType))); - }); - BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { - reduce_workspace_size = - std::max(reduce_workspace_size, - broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, - red_exclude_src_shape, - sizeof(DType))); - }); + reduce_workspace_size = + std::max(reduce_workspace_size, + broadcast::ReduceWorkspaceSize(s, red_dst_shape, + kAddTo, red_src_shape, + sizeof(DType))); + reduce_workspace_size = + std::max(reduce_workspace_size, + broadcast::ReduceWorkspaceSize(s, red_exclude_dst_shape, kAddTo, + red_exclude_src_shape, + sizeof(DType))); }); workspace = ctx.requested[0].get_space_typed( Shape1(reduce_workspace_size + data_size * 2 + red_out_size), s); diff --git a/src/operator/numpy/linalg/broadcast_reduce_op_customized.h b/src/operator/numpy/linalg/broadcast_reduce_op_customized.h index 25f66d04f663..8e1c0b3db18d 100644 --- a/src/operator/numpy/linalg/broadcast_reduce_op_customized.h +++ b/src/operator/numpy/linalg/broadcast_reduce_op_customized.h @@ -51,8 +51,8 @@ void ReduceAxesComputeImplWithReducer(const OpContext& ctx, const TBlob in_data = inputs[0].reshape(src_shape); const TBlob out_data = outputs[0].reshape(dst_shape); BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - size_t workspace_size = broadcast::ReduceWorkspaceSize( - s, out_data.shape_, req[0], in_data.shape_); + size_t workspace_size = broadcast::ReduceWorkspaceSize( + s, out_data.shape_, req[0], in_data.shape_, sizeof(OType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); broadcast::ReduceWithReducer( diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 205fb621d4e8..eef6188b1283 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -820,10 +820,8 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, TShape src_shape, dst_shape; BroadcastReduceShapeCompact(data.shape_, small1, &src_shape, &dst_shape); size_t workspace_size = 0; - MXNET_NDIM_SWITCH(dst_shape.ndim(), NDim, { - workspace_size = broadcast::ReduceWorkspaceSize( - s, dst_shape, {kWriteTo}, src_shape); - }); + workspace_size = broadcast::ReduceWorkspaceSize( + s, dst_shape, {kWriteTo}, src_shape, sizeof(DType)); size_t temp_mem_size = temp_data_size + temp_sum_size + workspace_size; Tensor temp_mem = ctx.requested[0].get_space_typed(Shape1(temp_mem_size), s); @@ -993,10 +991,8 @@ void NumpyMomentsForward(const nnvm::NodeAttrs& attrs, MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { // Get workspace and temp space for data - mean size_t workspace_size = 0; - BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - workspace_size = broadcast::ReduceWorkspaceSize( - s, dst_shape, req[0], src_shape, sizeof(DType)); - }); + workspace_size = broadcast::ReduceWorkspaceSize( + s, dst_shape, req[0], src_shape, sizeof(DType)); size_t temp_data_size = data.shape_.Size() * sizeof(DType); size_t temp_mem_size = temp_data_size + workspace_size; Tensor temp_mem = diff --git a/src/operator/numpy/np_cross-inl.h b/src/operator/numpy/np_cross-inl.h index c2092bbfec23..cb6bbdb1f137 100644 --- a/src/operator/numpy/np_cross-inl.h +++ b/src/operator/numpy/np_cross-inl.h @@ -659,10 +659,9 @@ struct ReduceImplWrap { size_t ws_reduce = 0U; std::vector reduce_axis = GetReduceAxis(out_move_shape, 
in_move_shape); if (reduce_axis.empty() || req == kNullOp) { return 0U; } - SUM_NDIM_SWITCH(out_shape.ndim(), NDim, { - ws_reduce = broadcast::ReduceWorkspaceSize(ctx.get_stream(), - out_shape, req, in_shape); - }); + ws_reduce = broadcast::ReduceWorkspaceSize(ctx.get_stream(), + out_shape, req, in_shape, + sizeof(DType)); return ws_reduce; } diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h index f813289cda6d..1fa58908a113 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_op.h @@ -412,12 +412,10 @@ void NumpyBinaryBackwardUseIn(const nnvm::NodeAttrs& attrs, MSHADOW_TYPE_SWITCH(ograd.type_flag_, OType, { if (need_bc) { - BROADCAST_NDIM_SWITCH(new_oshape.ndim(), ndim, { - workspace_size_l = ReduceWorkspaceSize( + workspace_size_l = ReduceWorkspaceSize( s, new_lshape, req[0], new_oshape, new_lshape, new_rshape, sizeof(OType)); - workspace_size_r = ReduceWorkspaceSize( + workspace_size_r = ReduceWorkspaceSize( s, new_rshape, req[1], new_oshape, new_lshape, new_rshape, sizeof(OType)); - }); } size_t workspace_size = std::max(workspace_size_l, workspace_size_r); size_t cast_tensor_size = tensor_size * sizeof(OType); diff --git a/src/operator/numpy/np_where_op-inl.h b/src/operator/numpy/np_where_op-inl.h index 3233f785d246..10ec081b2a8f 100644 --- a/src/operator/numpy/np_where_op-inl.h +++ b/src/operator/numpy/np_where_op-inl.h @@ -225,10 +225,10 @@ inline void NumpyWhereOpBackward(const nnvm::NodeAttrs& attrs, Tensor workspace; size_t ws_size = 0; if (ograd.shape_ != dx.shape_ || ograd.shape_ != dy.shape_) { - size_t ws_size1 = broadcast::ReduceWorkspaceSize( - s, expanded_lshape, req[0], expanded_oshape); - size_t ws_size2 = broadcast::ReduceWorkspaceSize( - s, expanded_rshape, req[1], expanded_oshape); + size_t ws_size1 = broadcast::ReduceWorkspaceSize( + s, expanded_lshape, req[0], expanded_oshape, sizeof(DType)); + size_t ws_size2 = broadcast::ReduceWorkspaceSize( + s, expanded_rshape, req[1], expanded_oshape, sizeof(DType)); ws_size = std::max(ws_size1, ws_size2); } // process left output @@ -366,8 +366,8 @@ inline void NumpyWhereScalarOpBackward(const nnvm::NodeAttrs& attrs, Tensor workspace; size_t ws_size = 0; if (ograd.shape_ != dx.shape_) { - ws_size = broadcast::ReduceWorkspaceSize( - s, expanded_lshape, req[0], expanded_oshape); + ws_size = broadcast::ReduceWorkspaceSize(s, expanded_lshape, req[0], + expanded_oshape, sizeof(DType)); } // If lscalar, then process right output, `is_left` should be false if (ograd.shape_ == dx.shape_) { diff --git a/src/operator/numpy/random/np_exponential_op.h b/src/operator/numpy/random/np_exponential_op.h index 36d29ff842e3..203430dd5879 100644 --- a/src/operator/numpy/random/np_exponential_op.h +++ b/src/operator/numpy/random/np_exponential_op.h @@ -171,7 +171,7 @@ inline void ExponentialReparamBackwardImpl(const OpContext& ctx, const TBlob samples = inputs[3].reshape(new_oshape); const TBlob noise = inputs[4].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); Reduce( diff --git a/src/operator/numpy/random/np_location_scale_op.h b/src/operator/numpy/random/np_location_scale_op.h index 00c89c149c5c..49bcbc3d0413 100644 --- a/src/operator/numpy/random/np_location_scale_op.h +++ b/src/operator/numpy/random/np_location_scale_op.h @@ 
-296,10 +296,10 @@ inline void LocationScaleReparamBackwardImpl(const OpContext& ctx, const TBlob rhs = inputs[3].reshape(new_rshape); const TBlob samples = inputs[4].reshape(new_oshape); const TBlob noise = inputs[5].reshape(new_oshape); - size_t workspace_size_l = ReduceWorkspaceSize( - s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_); - size_t workspace_size_r = ReduceWorkspaceSize( - s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_); + size_t workspace_size_l = ReduceWorkspaceSize( + s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); + size_t workspace_size_r = ReduceWorkspaceSize( + s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); size_t workspace_size = std::max(workspace_size_l, workspace_size_r); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -329,7 +329,7 @@ inline void ScalarLocationScaleReparamBackwardImpl(const OpContext& ctx, const TBlob samples = inputs[3].reshape(new_oshape); const TBlob noise = inputs[4].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); if (loc_is_tensor) { diff --git a/src/operator/numpy/random/np_normal_op.h b/src/operator/numpy/random/np_normal_op.h index d81f3d38f3a3..332200dc6cd2 100644 --- a/src/operator/numpy/random/np_normal_op.h +++ b/src/operator/numpy/random/np_normal_op.h @@ -261,10 +261,10 @@ inline void NormalReparamBackwardImpl(const OpContext& ctx, const TBlob rhs = inputs[3].reshape(new_rshape); const TBlob samples = inputs[4].reshape(new_oshape); const TBlob noise = inputs[5].reshape(new_oshape); - size_t workspace_size_l = ReduceWorkspaceSize( - s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_); - size_t workspace_size_r = ReduceWorkspaceSize( - s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_); + size_t workspace_size_l = ReduceWorkspaceSize( + s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); + size_t workspace_size_r = ReduceWorkspaceSize( + s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); size_t workspace_size = std::max(workspace_size_l, workspace_size_r); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); @@ -294,7 +294,7 @@ inline void ScalarNormalReparamBackwardImpl(const OpContext& ctx, const TBlob samples = inputs[3].reshape(new_oshape); const TBlob noise = inputs[4].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); if (loc_is_tensor) { diff --git a/src/operator/numpy/random/np_pareto_op.h b/src/operator/numpy/random/np_pareto_op.h index a8a5d7f411c0..af0e6c568187 100644 --- a/src/operator/numpy/random/np_pareto_op.h +++ b/src/operator/numpy/random/np_pareto_op.h @@ -174,7 +174,7 @@ inline void ScalarParetoReparamBackwardImpl(const OpContext& ctx, const TBlob samples = inputs[3].reshape(new_oshape); const TBlob noise = inputs[4].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); Tensor workspace = 
ctx.requested[0].get_space_typed(Shape1(workspace_size), s); Reduce( diff --git a/src/operator/numpy/random/np_rayleigh_op.h b/src/operator/numpy/random/np_rayleigh_op.h index 3444f3b74af5..0bbaf5d7158b 100644 --- a/src/operator/numpy/random/np_rayleigh_op.h +++ b/src/operator/numpy/random/np_rayleigh_op.h @@ -172,7 +172,7 @@ inline void ScalarRayleighReparamBackwardImpl(const OpContext& ctx, const TBlob samples = inputs[3].reshape(new_oshape); const TBlob noise = inputs[4].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); Reduce( diff --git a/src/operator/numpy/random/np_weibull_op.h b/src/operator/numpy/random/np_weibull_op.h index ff4c40ae8db5..74aeeff9f1fc 100644 --- a/src/operator/numpy/random/np_weibull_op.h +++ b/src/operator/numpy/random/np_weibull_op.h @@ -174,7 +174,7 @@ inline void ScalarWeibullReparamBackwardImpl(const OpContext& ctx, const TBlob samples = inputs[3].reshape(new_oshape); const TBlob noise = inputs[4].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); Reduce( diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h index 1432baa39b1b..2c5c1ebe1fd3 100644 --- a/src/operator/quantization/quantization_utils.h +++ b/src/operator/quantization/quantization_utils.h @@ -184,7 +184,7 @@ inline size_t ConfigReduce(mshadow::Stream* s, CHECK_EQ(src_shape->ndim(), NDim); CHECK_EQ(dst_shape->ndim(), NDim); - return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape, sizeof(DType)); + return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape, sizeof(DType)); } enum QuantizeOutType { kAuto = 0, kInt8, kUint8 }; diff --git a/src/operator/random/pdf_op.h b/src/operator/random/pdf_op.h index fdba85f1101a..57bddfc2b1fe 100644 --- a/src/operator/random/pdf_op.h +++ b/src/operator/random/pdf_op.h @@ -588,7 +588,7 @@ void PdfOpBackward(const nnvm::NodeAttrs& attrs, const TShape src_shape(Shape2(N, outputs[0].Size() / N)), dst_shape(Shape2(N, 1)); // Inputs to PdfOpBackward: grad, samples, parm1, parm2, pdf. 
MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - const size_t red_work_size(broadcast::ReduceWorkspaceSize<2>( + const size_t red_work_size(broadcast::ReduceWorkspaceSize( s, dst_shape, kAddTo, src_shape, sizeof(DType))); const size_t tmp_size(outputs[0].Size() * pnum * sizeof(DType) + red_work_size); Tensor tmp_space = diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index d2438103cf27..bda0434f6005 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -401,16 +401,14 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, small.shape_.get(), rshape, rstride, ws_dptr); } -template -size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big, const int type_size) { +inline size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, + const mxnet::TShape& big, const int type_size) { return 0; } -template -size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, - const mxnet::TShape& big, const mxnet::TShape& lhs, - const mxnet::TShape& rhs, const int type_size) { +inline size_t ReduceWorkspaceSize(Stream *s, const mxnet::TShape& small, const OpReqType req, + const mxnet::TShape& big, const mxnet::TShape& lhs, + const mxnet::TShape& rhs, const int type_size) { return 0; } @@ -637,18 +635,16 @@ struct ReduceImplConfig { } }; -template -size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, - const ::mxnet::TShape& big, const int type_size) { +inline size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big, const int type_size) { if (req == kNullOp) return 0; ReduceImplConfig config(small, big, nullptr, nullptr, type_size); return config.workspace_size; } -template -size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, - const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, - const ::mxnet::TShape& rhs, const int type_size) { +inline size_t ReduceWorkspaceSize(Stream *s, const ::mxnet::TShape& small, const OpReqType req, + const ::mxnet::TShape& big, const ::mxnet::TShape& lhs, + const ::mxnet::TShape& rhs, const int type_size) { if (req == kNullOp) return 0; ReduceImplConfig config(small, big, &lhs, &rhs, type_size); return config.workspace_size; diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index 35624287b5a0..5643ab4fd851 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -633,8 +633,8 @@ void ReduceAxesComputeImpl(const OpContext& ctx, const TBlob in_data = inputs[0].reshape(src_shape); const TBlob out_data = outputs[0].reshape(dst_shape); BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - size_t workspace_size = broadcast::ReduceWorkspaceSize( - s, out_data.shape_, req[0], in_data.shape_, sizeof(DType)); + size_t workspace_size = broadcast::ReduceWorkspaceSize( + s, out_data.shape_, req[0], in_data.shape_, sizeof(OType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); broadcast::Reduce( @@ -666,8 +666,8 @@ void ReduceAxesComputeBoolImpl(const OpContext& ctx, const TBlob in_data = inputs[0].reshape(src_shape); const TBlob out_data = outputs[0].reshape(dst_shape); BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - size_t workspace_size = broadcast::ReduceWorkspaceSize( - s, 
out_data.shape_, req[0], in_data.shape_); + size_t workspace_size = broadcast::ReduceWorkspaceSize( + s, out_data.shape_, req[0], in_data.shape_, sizeof(OType)); Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); broadcast::ReduceBool( diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index b5e267771662..136005bd7700 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -420,10 +420,10 @@ void BinaryBroadcastRTCBackwardUseIn::operator()(const nnvm::NodeAttrs& attrs, const TBlob ograd = inputs[0].reshape(new_oshape); const TBlob lhs = inputs[1].reshape(new_lshape); const TBlob rhs = inputs[2].reshape(new_rshape); - size_t workspace_size_l = broadcast::ReduceWorkspaceSize( + size_t workspace_size_l = broadcast::ReduceWorkspaceSize( s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_, common::mshadow_type_info(outputs[0].type_flag_).size); - size_t workspace_size_r = broadcast::ReduceWorkspaceSize( + size_t workspace_size_r = broadcast::ReduceWorkspaceSize( s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_, common::mshadow_type_info(outputs[1].type_flag_).size); size_t workspace_size = std::max(workspace_size_l, workspace_size_r); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index a6b4d483c73d..e3ba92ddd0ff 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -628,9 +628,9 @@ inline void BinaryBroadcastBackwardUseInImpl(const OpContext& ctx, const TBlob ograd = inputs[0].reshape(new_oshape); const TBlob lhs = inputs[1].reshape(new_lshape); const TBlob rhs = inputs[2].reshape(new_rshape); - size_t workspace_size_l = ReduceWorkspaceSize( + size_t workspace_size_l = ReduceWorkspaceSize( s, lgrad.shape_, req[0], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); - size_t workspace_size_r = ReduceWorkspaceSize( + size_t workspace_size_r = ReduceWorkspaceSize( s, rgrad.shape_, req[1], ograd.shape_, lhs.shape_, rhs.shape_, sizeof(DType)); size_t workspace_size = std::max(workspace_size_l, workspace_size_r); Tensor workspace = From cd87eb248c4e9b447631d5fdcdf7f511b7bfdd7a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 23 Jun 2020 13:43:47 -0700 Subject: [PATCH 17/64] Fixes from rebase --- src/common/cuda/rtc/backward_functions-inl.h | 23 +++- src/common/cuda/rtc/forward_functions-inl.h | 113 +++++++++++++++++- src/common/cuda/rtc/special_functions-inl.h | 42 +++++++ src/common/cuda/rtc/util-inl.h | 25 ++++ src/common/cuda/utils.h | 2 +- .../nn/modulated_deformable_im2col.cuh | 2 +- src/operator/fusion/fused_op-inl.h | 2 +- .../broadcast_reduce_customized-inl.cuh | 21 ++-- .../numpy/linalg/np_matrix_rank-inl.h | 12 +- src/operator/numpy/linalg/np_pinv-inl.h | 42 +++++-- src/operator/numpy/np_broadcast_reduce_op.h | 18 ++- src/operator/numpy/np_cross-inl.h | 69 ++++++++--- .../numpy/np_elemwise_broadcast_op.cu | 12 +- .../np_elemwise_broadcast_op_extended.cu | 2 +- .../np_elemwise_broadcast_op_extended_sec.cu | 33 +++-- src/operator/numpy/np_polynomial_op.cu | 2 +- src/operator/numpy/np_true_divide.cu | 3 +- src/operator/tensor/broadcast_reduce-inl.cuh | 3 +- src/operator/tensor/elemwise_binary_op.cc | 5 - src/operator/tensor/elemwise_binary_op.h | 17 --- .../tensor/elemwise_binary_scalar_op.cc | 5 +- .../tensor/elemwise_unary_op_basic.cu | 5 +- 
src/profiler/storage_profiler.cc | 2 +- 23 files changed, 342 insertions(+), 118 deletions(-) diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index e7d7b7fd14a8..52a3b2aa18d2 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -301,12 +301,6 @@ __device__ inline DType copysign_grad(const DType val, return (val >= 0 && val2 >= 0) || (val < 0 && val2 < 0) ? 1 : -1; } -template -__device__ inline DType zero_grad(const DType val, - const DType2 val2) { - return 0; -} - template __device__ inline DType arctan2_grad(const DType val, const DType2 val2) { @@ -386,6 +380,23 @@ backward_gammaln(const DTypeGrad grad, const DType val) { } } +template +__device__ inline typename type_util::mixed_type::type +backward_digamma(const DTypeGrad grad, const DType val) { + if (type_util::is_same::value) { + return grad * op::special_functions::trigamma(val); + } else { + return grad * op::special_functions::trigamma(val); + } +} + +template +__device__ inline typename type_util::mixed_type::type +backward_gelu(const DTypeGrad grad, const DType val) { + return 0.5f * (grad + grad * op::erf(val / op::sqrt(2.0f)) + + val * backward_erf(grad, val / op::sqrt(2.0f)) / op::sqrt(2.0f)); +} + template __device__ inline DType smooth_l1_grad(const DType val, const DType2 scalar) { auto bsq = scalar * scalar; diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 83b915b5b0e5..923c16447947 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -336,6 +336,13 @@ max(const DType a, const DType2 b) { return a > b ? a : b; } +template +__device__ inline typename type_util::mixed_type::type +fmax(const DType a, const DType2 b) { + if (isnan(b)) return a; + return a > b ? a : b; +} + template __device__ inline typename type_util::mixed_type::type min(const DType a, const DType2 b) { @@ -343,6 +350,13 @@ min(const DType a, const DType2 b) { return a < b ? a : b; } +template +__device__ inline typename type_util::mixed_type::type +fmin(const DType a, const DType2 b) { + if (isnan(b)) return a; + return a < b ? a : b; +} + DEFINE_BINARY_MATH_FUNC(hypot, ::hypot, ::hypotf) template @@ -370,12 +384,27 @@ mod(const DType a, const DType2 b) { } } +template +__device__ inline typename type_util::mixed_type::type +fmod(const DType a, const DType2 b) { + if (b == 0) { + return 0; + } + return ::fmod(static_cast(a), static_cast(b)); +} + template __device__ inline typename type_util::mixed_type::type rmod(const DType a, const DType2 b) { return op::mod(b, a); } +template +__device__ inline typename type_util::mixed_type::type +rfmod(const DType a, const DType2 b) { + return op::fmod(b, a); +} + template __device__ inline DType equal(const DType a, const DType2 b) { return a == static_cast(b) ? 1 : 0; @@ -509,6 +538,14 @@ __device__ inline typename type_util::mixed_type::type bitwise_xo return static_cast(a) ^ static_cast(b); } +template +__device__ inline DType bitwise_not(const DType a) { + if (type_util::is_same::value) { + return !a; + } else { + return ~static_cast(a); + } +} DEFINE_BINARY_MATH_FUNC(arctan2, ::atan2, ::atan2f) @@ -642,8 +679,8 @@ __device__ inline DType square(const DType val) { return val * val; } -template -__device__ inline typename LoadType::Type zero(const DType val) { +template +__device__ inline typename LoadType::Type zero(const DType val, const DTypes... 
args) { return 0; } @@ -652,8 +689,8 @@ __device__ inline typename LoadType::Type zero() { return 0; } -template -__device__ inline typename LoadType::Type one(const DType val) { +template +__device__ inline typename LoadType::Type one(const DType val, const DTypes... args) { return 1; } @@ -662,6 +699,16 @@ __device__ inline typename LoadType::Type one() { return 1; } +template +__device__ inline typename LoadType::Type negone(const DType val, const DTypes... args) { + return -1; +} + +template +__device__ inline typename LoadType::Type negone() { + return -1; +} + template __device__ inline DType round(const DType val) { if (type_util::has_double::value) { @@ -746,6 +793,11 @@ DEFINE_UNARY_MATH_FUNC(gammaln, ::lgamma, ::lgammaf) DEFINE_UNARY_MATH_FUNC(erf, ::erf, ::erff) DEFINE_UNARY_MATH_FUNC(erfinv, ::erfinv, ::erfinvf) +template +__device__ inline DType gelu(const DType val) { + return 0.5f * val * (1.0f + op::erf(val / op::sqrt(2.0f))); +} + template __device__ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { const auto bsq = scalar * scalar; @@ -759,6 +811,15 @@ __device__ inline DType1 smooth_l1(const DType1 val, const DType2 scalar) { } } +template +__device__ inline DType digamma(const DType val) { + if (type_util::has_double_or_integral::value) { + return special_functions::cephes::psi(val); + } else { + return special_functions::cephes::psi(val); + } +} + template __device__ inline DType logical_not(const DType val) { return val != DType(0) ? DType(0) : DType(1); @@ -769,15 +830,55 @@ __device__ inline bool np_logical_not(const DType val) { return !static_cast(val); } +template +__device__ inline bool np_logical_and(const DType val, const DType2 val2) { + return (val && val2) ? true : false; +} + +template +__device__ inline bool np_logical_or(const DType val, const DType2 val2) { + return (val || val2) ? true : false; +} + +template +__device__ inline bool np_logical_xor(const DType val, const DType2 val2) { + return ((val || val2) && !(val && val2)) ? 
true : false; +} + +template +__device__ inline bool isnan(const DType val) { + return util::isnan(val); +} + +template +__device__ inline bool isinf(const DType val) { + return util::isinf(val); +} + +template +__device__ inline bool isposinf(const DType val) { + return util::isinf(val) && (val > 0); +} + +template +__device__ inline bool isneginf(const DType val) { + return util::isinf(val) && (val < 0); +} + +template +__device__ inline bool isfinite(const DType val) { + return !op::isnan(val) && !op::isinf(val); +} + #undef DEFINE_UNARY_MATH_FUNC template -__device__ inline DType left(DType left_val, DType2 right_val) { +__device__ inline DType left(const DType left_val, const DType2 right_val) { return left_val; } template -__device__ inline DType2 right(DType left_val, DType2 right_val) { +__device__ inline DType2 right(const DType left_val, const DType2 right_val) { return right_val; } diff --git a/src/common/cuda/rtc/special_functions-inl.h b/src/common/cuda/rtc/special_functions-inl.h index cec9d7891199..deede71ea06c 100644 --- a/src/common/cuda/rtc/special_functions-inl.h +++ b/src/common/cuda/rtc/special_functions-inl.h @@ -83,6 +83,48 @@ struct helper_numeric_limits { } }; +template +__device__ inline static DType trigamma(DType x); + +template<> +__device__ inline double trigamma(double x) { + double PI(3.14159265358979323846); + double sign = +1; + double result = 0; + if (x < 0.5) { + sign = -1; + const double sin_pi_x = sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const double ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1./6 - ixx * (1./30 - ixx * (1./42)))) / x; + return sign * result; +} + +template<> +__device__ inline float trigamma(float x) { + float PI(3.14159265358979323846); + float sign = +1; + float result = 0; + if (x < 0.5f) { + sign = -1; + const float sin_pi_x = sinf(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const float ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x; + return sign * result; +} struct cephes { /* diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index 4fe93a72cf43..f8eac0463dd0 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -255,6 +255,31 @@ __device__ inline bool isinf(volatile const float16 &val) { return ::isinf(__half2float(const_cast(val))); } +template +__device__ inline bool isnan(volatile const DType &val) { + return false; +} + +template <> +__device__ inline bool isnan(volatile const float &val) { + return ::isnan(val); +} + +template <> +__device__ inline bool isnan(volatile const double &val) { + return ::isnan(val); +} + +template <> +__device__ inline bool isnan(volatile const long double &val) { + return ::isnan(val); +} + +template <> +__device__ inline bool isnan(volatile const float16 &val) { + return ::isnan(__half2float(const_cast(val))); +} + } // namespace util )code"; } // namespace rtc diff --git a/src/common/cuda/utils.h b/src/common/cuda/utils.h index 0971cfd22361..3e4bba1919ea 100644 --- a/src/common/cuda/utils.h +++ b/src/common/cuda/utils.h @@ -19,7 +19,7 @@ /*! * Copyright (c) 2015 by Contributors - * \file cuda_utils.h + * \file utils.h * \brief Common CUDA utilities. 
*/ #ifndef MXNET_COMMON_CUDA_UTILS_H_ diff --git a/src/operator/contrib/nn/modulated_deformable_im2col.cuh b/src/operator/contrib/nn/modulated_deformable_im2col.cuh index 16d9cef46d4e..9673edf813a4 100644 --- a/src/operator/contrib/nn/modulated_deformable_im2col.cuh +++ b/src/operator/contrib/nn/modulated_deformable_im2col.cuh @@ -86,7 +86,7 @@ #include #include #include "../../mxnet_op.h" -#include "../../../common/cuda_utils.h" +#include "../../../common/cuda/utils.h" diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h index d2e2fae7c482..0add7eaa99da 100644 --- a/src/operator/fusion/fused_op-inl.h +++ b/src/operator/fusion/fused_op-inl.h @@ -201,7 +201,7 @@ const std::map>> LeakyReLU_ops {"gelu" , {{"op::gelu(%)", "_0"}}}, }; const std::map>> LeakyReLU_bwd_ops = { - {"gelu" , {{"op::backward_gelu(%, %)", "_1", "_0"}}}, + {"gelu" , {{"op::backward_gelu(%, %)", "_0", "_1"}}}, }; const std::map slice_ops = { diff --git a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.cuh b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.cuh index 357ce6cd31d5..d4374edc9828 100644 --- a/src/operator/numpy/linalg/broadcast_reduce_customized-inl.cuh +++ b/src/operator/numpy/linalg/broadcast_reduce_customized-inl.cuh @@ -285,7 +285,7 @@ __global__ void reduce_kernel_M1_wr(const int N, const bool addto, template void ReduceImplWithReducer(cudaStream_t stream, const TBlob& small, const OpReqType req, const TBlob& big, const Tensor& workspace, - const ReduceImplConfig& config, + const ReduceImplConfig& config, Reducer* reducer = nullptr) { bool need_clean = !reducer; reducer = reducer ? reducer : new Reducer(); @@ -310,13 +310,13 @@ void ReduceImplWithReducer(cudaStream_t stream, const TBlob& small, const OpReqT const int by = (config.kernel_1.do_transpose) ? config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; - const bool do_unroll = ( config.M / (by*config.Mnext) >= config.unroll_reduce ); - KERNEL_UNROLL_SWITCH(do_unroll, ReduceImplConfig::unroll_reduce, UNROLL, { + const bool do_unroll = ( config.M / (by*config.Mnext) >= unroll_reduce ); + KERNEL_UNROLL_SWITCH(do_unroll, unroll_reduce, UNROLL, { reduce_kernel_wr <<< config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, stream>>>( config.N, config.M, addto, big.dptr(), small_dptr, big.shape_.get(), - small.shape_.get(), config.rshape, config.rstride, config.Mnext, - config.kernel_1.do_transpose, reducer); + small.shape_.get(), config.rshape.get(), config.rstride.get(), + config.Mnext, config.kernel_1.do_transpose, reducer); }); MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel_wr); @@ -335,7 +335,7 @@ void ReduceImplWithReducer(cudaStream_t stream, const TBlob& small, const OpReqT template void ReduceImplWithReducer(cudaStream_t stream, const TBlob& small, const TBlob& lhs, const TBlob& rhs, const OpReqType req, const TBlob& big, const Tensor& workspace, - const ReduceImplConfig& config, Reducer* reducer = nullptr) { + const ReduceImplConfig& config, Reducer* reducer = nullptr) { bool need_clean = !reducer; reducer = reducer ? reducer : new Reducer(); if (config.M == 1) { @@ -360,8 +360,8 @@ void ReduceImplWithReducer(cudaStream_t stream, const TBlob& small, const TBlob& const int by = (config.kernel_1.do_transpose) ? 
config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; - const bool do_unroll = ( config.M / (by*config.Mnext) >= config.unroll_reduce ); - KERNEL_UNROLL_SWITCH(do_unroll, ReduceImplConfig::unroll_reduce, UNROLL, { + const bool do_unroll = ( config.M / (by*config.Mnext) >= unroll_reduce ); + KERNEL_UNROLL_SWITCH(do_unroll, unroll_reduce, UNROLL, { reduce_kernel_wr <<< config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, stream>>>( config.N, config.M, addto, big.dptr(), lhs.dptr(), rhs.dptr(), @@ -393,14 +393,13 @@ void ReduceWithReducer(Stream *s, const TBlob& small, const OpReqType req, cudaStream_t stream = Stream::GetStream(s); bool need_clean = !reducer; reducer = reducer ? reducer : new Reducer(); - ReduceImplConfig config = - ConfigureReduceImpl(small.shape_, big.shape_, nullptr, nullptr); + ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr, sizeof(DType)); if (safe_acc) { MXNET_ACC_TYPE_SWITCH(mshadow::DataType::kFlag, DataType, AType, { typedef typename std::conditional::type AccType; MSHADOW_TYPE_SWITCH(small.type_flag_, OType, { typedef typename std::conditional::type OutType; - config = ConfigureReduceImpl(small.shape_, big.shape_, nullptr, nullptr); + config = ReduceImplConfig(small.shape_, big.shape_, nullptr, nullptr, sizeof(AccType)); ReduceImplWithReducer( stream, small, req, big, workspace, config, reducer); }); diff --git a/src/operator/numpy/linalg/np_matrix_rank-inl.h b/src/operator/numpy/linalg/np_matrix_rank-inl.h index 8ccecb57db11..bde03c0ab845 100644 --- a/src/operator/numpy/linalg/np_matrix_rank-inl.h +++ b/src/operator/numpy/linalg/np_matrix_rank-inl.h @@ -410,9 +410,15 @@ void MatrixRankForwardImpl(const TBlob& a, if (new_tol_data.dptr() != tol.dptr()) { Copy(new_tol_data.FlatTo1D(s), tol.FlatTo1D(s), s); } - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {s_data, new_tol_data}, - {kWriteTo}, {broadcast_data}); + if constexpr (std::is_same::value) { + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {s_data, new_tol_data}, + {kWriteTo}, {broadcast_data}); + } else { + mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, + {s_data, new_tol_data}, + {kWriteTo}, {broadcast_data}); + } // Step5: Calculate rank. const int b_ndim = broadcast_shape.ndim(); const int data_size = broadcast_data.size(b_ndim - 1); diff --git a/src/operator/numpy/linalg/np_pinv-inl.h b/src/operator/numpy/linalg/np_pinv-inl.h index b3b8e0c76c64..a82597edb877 100644 --- a/src/operator/numpy/linalg/np_pinv-inl.h +++ b/src/operator/numpy/linalg/np_pinv-inl.h @@ -553,13 +553,23 @@ void PinvOpForwardImpl(const TBlob& a, s, S.size(0), Smax.dptr_, S.dptr_, S.size(1), S.stride_); // Step3: Calculate Cutoff. std::vector temp_req({kWriteTo}); - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {rcond_data, smax_data}, - temp_req, {cutoff_data}); - // Step4: Calculte Large. - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {s_data, cutoff_data}, - temp_req, {large_data}); + if constexpr (std::is_same::value) { + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {rcond_data, smax_data}, + temp_req, {cutoff_data}); + // Step4: Calculte Large. + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {s_data, cutoff_data}, + temp_req, {large_data}); + } else { + mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {rcond_data, smax_data}, + temp_req, {cutoff_data}); + // Step4: Calculte Large. + mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, + {s_data, cutoff_data}, + temp_req, {large_data}); + } // Step5: Discard small singular values. 
mxnet_op::Kernel::Launch( s, s_data.Size(), s_data.dptr(), large_data.dptr()); @@ -573,8 +583,13 @@ void PinvOpForwardImpl(const TBlob& a, } s_data = s_data.reshape(s_shape_newaxis); u_data = ut_data.reshape(ut_shape); - mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, - temp_req, {u_data}); + if constexpr (std::is_same::value) { + mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, + temp_req, {u_data}); + } else { + mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, + temp_req, {u_data}); + } gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), pinv_a.FlatToKD(s), @@ -712,8 +727,13 @@ void PinvScalarRcondOpForwardImpl(const TBlob& a, } s_data = s_data.reshape(s_shape_newaxis); u_data = ut_data.reshape(ut_shape); - mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, - {kWriteTo}, {u_data}); + if constexpr (std::is_same::value) { + mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, + {kWriteTo}, {u_data}); + } else { + mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, + {kWriteTo}, {u_data}); + } gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), pinv_a.FlatToKD(s), diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index eef6188b1283..13d7e908da4c 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -832,8 +832,13 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, // Compute weighted data TBlob wa = TBlob(temp_data_ptr, data.shape_, xpu::kDevMask); - BinaryBroadcastCompute( - attrs, ctx, {data, weights}, {kWriteTo}, {wa}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute( + attrs, ctx, {data, weights}, {kWriteTo}, {wa}); + } else { + BinaryBroadcastRTCCompute {"mul"}( + attrs, ctx, {data, weights}, {kWriteTo}, {wa}); + } // Compute sum of weighted data TBlob sum_of_wa = TBlob(temp_sum_ptr, small1, xpu::kDevMask); @@ -850,8 +855,13 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, ctx, {weights}, {kWriteTo}, {scl}, workspace, w_src_shape, w_dst_shape); // Compute avg and assign output - BinaryBroadcastCompute( - attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute( + attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); + } else { + BinaryBroadcastRTCCompute {"div"}( + attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); + } } else { // Compute and assign the derivatives of a and weights const TBlob& igrad_a = outputs[0]; diff --git a/src/operator/numpy/np_cross-inl.h b/src/operator/numpy/np_cross-inl.h index cb6bbdb1f137..ce0bb1fe1d8d 100644 --- a/src/operator/numpy/np_cross-inl.h +++ b/src/operator/numpy/np_cross-inl.h @@ -390,8 +390,13 @@ struct NumpyCrossForwardImpl { mxnet_op::Kernel::Launch(s, bw_data.Size(), b_data.dptr(), bw_data.dptr(), b_data.size(b_ndim - 1), b_index_vec[i], b_data.Size()); - BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, - { kWriteTo }, { cw_data_vec[idx] }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data_vec[idx] }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data_vec[idx] }); + } MXNET_ASSIGN_REQ_SWITCH(req_vec[i], req_type, { mxnet_op::Kernel, xpu>::Launch(s, cw_data_vec[idx].Size(), cw_data_vec[idx].dptr(), @@ -493,18 +498,30 @@ struct NumpyCrossForwardImpl { 
mxnet_op::Kernel::Launch(s, bw_data.Size(), b_data.dptr(), bw_data.dptr(), b_data.size(b_ndim - 1), 1, b_data.Size()); - BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, - { req[0] }, { c }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, + { req[0] }, { c }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, + { req[0] }, { c }); + } mxnet_op::Kernel::Launch(s, aw_data.Size(), a_data.dptr(), aw_data.dptr(), a_data.size(a_ndim - 1), 1, a_data.Size()); mxnet_op::Kernel::Launch(s, bw_data.Size(), b_data.dptr(), bw_data.dptr(), b_data.size(b_ndim - 1), 0, b_data.Size()); - BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, - { kWriteTo }, { cw_data }); - BinaryBroadcastCompute(attrs, ctx, { c, cw_data }, - { kWriteTo }, { c }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data }); + BinaryBroadcastCompute(attrs, ctx, { c, cw_data }, + { kWriteTo }, { c }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data }); + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, { c, cw_data }, + { kWriteTo }, { c }); + } } }; @@ -1195,8 +1212,13 @@ struct NumpyCrossBackwardImpl { b_move_data.size(b_ndim - 1), 1, b_move_data.Size()); // cw_data = grad_c_move * b_move_data[..., 1]. - BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, - { kWriteTo }, { cw_data }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); + } // Copy cw_data to grad_move_data[..., 0]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), @@ -1210,8 +1232,13 @@ struct NumpyCrossBackwardImpl { b_move_data.size(b_ndim - 1), 0, b_move_data.Size()); // cw_data = grad_c_move * b_move_data[..., 0]. - BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, - { kWriteTo }, { cw_data }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); + } // Copy -cw_data to grad_move_data[..., 1]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), @@ -1256,8 +1283,13 @@ struct NumpyCrossBackwardImpl { a_move_data.size(a_ndim - 1), 1, a_move_data.Size()); // cw_data = grad_c_move * a_move_data[..., 1]. - BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, - { kWriteTo }, { cw_data }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); + } // Copy -cw_data to grad_move_data[..., 0]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), @@ -1271,8 +1303,13 @@ struct NumpyCrossBackwardImpl { a_move_data.size(a_ndim - 1), 0, a_move_data.Size()); // cw_data = grad_c_move * a_move_data[..., 0]. 
- BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, - { kWriteTo }, { cw_data }); + if constexpr (std::is_same::value) { + BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); + } else { + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); + } // Copy cw_data to grad_move_data[..., 1]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu index e5a017070c2a..a6f85a8bc219 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op.cu @@ -32,15 +32,13 @@ NNVM_REGISTER_OP(_npi_add) .set_attr("FCompute", BinaryBroadcastRTCCompute{"add"}); NNVM_REGISTER_OP(_backward_npi_broadcast_add) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"one", "one"}); NNVM_REGISTER_OP(_npi_subtract) .set_attr("FCompute", BinaryBroadcastRTCCompute{"sub"}); NNVM_REGISTER_OP(_backward_npi_broadcast_sub) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"one", "negone"}); NNVM_REGISTER_OP(_npi_multiply) .set_attr("FCompute", BinaryBroadcastRTCCompute{"mul"}); @@ -52,15 +50,13 @@ NNVM_REGISTER_OP(_npi_mod) .set_attr("FCompute", BinaryBroadcastRTCCompute{"mod"}); NNVM_REGISTER_OP(_backward_npi_broadcast_mod) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"mod_grad", "mod_rgrad"}); NNVM_REGISTER_OP(_npi_power) .set_attr("FCompute", BinaryBroadcastRTCCompute{"power"}); NNVM_REGISTER_OP(_backward_npi_broadcast_power) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"power_grad", "power_rgrad"}); NNVM_REGISTER_OP(_npi_add_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"add"}); diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu index 4ec85e824cda..7dd7ba896b28 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu @@ -45,7 +45,7 @@ NNVM_REGISTER_OP(_npi_bitwise_or) NNVM_REGISTER_OP(_backward_npi_copysign) .set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"copysign_grad", - "zero_grad"}); + "zero"}); NNVM_REGISTER_OP(_npi_arctan2) .set_attr("FCompute", BinaryBroadcastRTCCompute{"arctan2"}); diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended_sec.cu b/src/operator/numpy/np_elemwise_broadcast_op_extended_sec.cu index fa2f3bf080c7..93d2cf18350e 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended_sec.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended_sec.cu @@ -29,49 +29,48 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_fmax) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"fmax"}); NNVM_REGISTER_OP(_backward_npi_fmax) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"greater_equal", "less"}); NNVM_REGISTER_OP(_npi_fmax_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"fmax"}); NNVM_REGISTER_OP(_backward_npi_fmax_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"greater_equal"}); NNVM_REGISTER_OP(_npi_fmin) -.set_attr("FCompute", 
BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"fmin"}); NNVM_REGISTER_OP(_backward_npi_fmin) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"less_equal", + "greater"}); NNVM_REGISTER_OP(_npi_fmin_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"fmin"}); NNVM_REGISTER_OP(_backward_npi_fmin_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"less_equal"}); NNVM_REGISTER_OP(_npi_fmod) -.set_attr("FCompute", BinaryBroadcastCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"fmod"}); NNVM_REGISTER_OP(_backward_npi_fmod) -.set_attr("FCompute", BinaryBroadcastBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"mod_grad", + "mod_rgrad"}); NNVM_REGISTER_OP(_npi_fmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"fmod"}); NNVM_REGISTER_OP(_backward_npi_fmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"mod_grad"}); NNVM_REGISTER_OP(_npi_rfmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarRTCCompute{"rfmod"}); NNVM_REGISTER_OP(_backward_npi_rfmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarRTCBackward{"rmod_grad"}); } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_polynomial_op.cu b/src/operator/numpy/np_polynomial_op.cu index 31f284b7a2a8..3c4655b2da22 100644 --- a/src/operator/numpy/np_polynomial_op.cu +++ b/src/operator/numpy/np_polynomial_op.cu @@ -23,7 +23,7 @@ */ #include "np_polynomial_op-inl.h" -#include "../../common/cuda_utils.h" +#include "../../common/cuda/utils.h" namespace mxnet { namespace op { diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu index c8eccfe140b4..757fa0d9e8a2 100644 --- a/src/operator/numpy/np_true_divide.cu +++ b/src/operator/numpy/np_true_divide.cu @@ -32,8 +32,7 @@ NNVM_REGISTER_OP(_npi_true_divide) .set_attr("FCompute", TrueDivideBroadcastCompute); NNVM_REGISTER_OP(_backward_npi_broadcast_div) -.set_attr("FCompute", NumpyBinaryBackwardUseIn); +.set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"div_grad", "div_rgrad"}); NNVM_REGISTER_OP(_npi_true_divide_scalar) .set_attr("FCompute", TrueDivideScalarCompute); diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 9e3adffe9ef1..c7a7c478cbb3 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -387,8 +387,7 @@ void ReduceBool(Stream *s, const TBlob& small, const OpReqType req, const Tensor& workspace, const TBlob& big) { if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); - ReduceImplConfig config = - ConfigureReduceImpl(small.shape_, big.shape_, nullptr, nullptr); + ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr, sizeof(DType)); ReduceImpl(stream, small, req, big, workspace, config); } diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index 861e498ff7c6..86b8b5ad9f1f 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -74,11 +74,6 @@ bool ElemwiseBinaryOp::BackwardUseInStorageType(const nnvm::NodeAttrs& attrs, const bool invalid_ctx = dev_mask != 
mshadow::cpu::kDevMask; const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback : DispatchMode::kFComputeEx; - const int ograd_stype = in_attrs->at(0); - const int lhs_stype = in_attrs->at(1); - const int rhs_stype = in_attrs->at(2); - int& lhs_grad_stype = out_attrs->at(0); - int& rhs_grad_stype = out_attrs->at(1); if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index d1f73e9bb6ec..a6b5d7263c76 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -136,14 +136,6 @@ class ElemwiseBinaryOp : public OpBase { } }); } -#if MXNET_USE_CUDA - template - static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif template static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, @@ -178,15 +170,6 @@ class ElemwiseBinaryOp : public OpBase { }); } -#if MXNET_USE_CUDA - template - static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - template< typename xpu, typename LOP, diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index 4fd4df391d69..c5ddb5812b9f 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -70,7 +70,10 @@ __global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, for (int i = 0; i < nvec; ++i) { const auto input = IType::from(loader.separate()[i]); // enables returning different type - const auto temp = OP(input, static_cast(params.scalar)); + const auto temp = OP(input, + static_cast::type> + (params.scalar)); if (req == OpReqType::kAddTo) { // temp2 may have a wider type than either temp diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index e241623c60cd..074f7ac69a26 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -211,11 +211,10 @@ NNVM_REGISTER_OP(_backward_gammaln) // digamma NNVM_REGISTER_OP(digamma) -.set_attr("FCompute", UnaryOp::Compute); +.set_attr("FCompute", UnaryRTCCompute{"digamma"}); NNVM_REGISTER_OP(_backward_digamma) -.set_attr("FCompute", ElemwiseBinaryOp::Compute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_digamma"}); // logical not NNVM_REGISTER_OP(logical_not) diff --git a/src/profiler/storage_profiler.cc b/src/profiler/storage_profiler.cc index edb16cf32337..5bbfa5917ea9 100644 --- a/src/profiler/storage_profiler.cc +++ b/src/profiler/storage_profiler.cc @@ -27,7 +27,7 @@ #include #include "./profiler.h" #include "../common/utils.h" -#include "../common/cuda_utils.h" +#include "../common/cuda/utils.h" namespace mxnet { namespace profiler { From e7ad72b3130b200680bf47d99dc4f53da3c9ca74 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 25 Jun 2020 17:42:34 -0700 Subject: [PATCH 18/64] Guarding RTC usage behing MXNET_USE_CUDA --- src/operator/leaky_relu-inl.h | 4 ++++ src/operator/numpy/linalg/np_matrix_rank-inl.h | 2 ++ src/operator/numpy/linalg/np_pinv-inl.h | 6 ++++++ src/operator/numpy/np_broadcast_reduce_op.h | 4 ++++ 
src/operator/numpy/np_cross-inl.h | 14 ++++++++++++++ 5 files changed, 30 insertions(+) diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index ccb93080c65a..d87fcbbc5b6c 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -262,10 +262,12 @@ class LeakyReLUOp : public Operator { in_data[leakyrelu::kData], in_data[leakyrelu::kGamma]}, req, in_grad); } else { +#if MXNET_USE_CUDA ElemwiseBinaryRTCBwdUseIn {"xelu_grad", "prelu_grad"}( nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], in_data[leakyrelu::kData], in_data[leakyrelu::kGamma]}, req, in_grad); +#endif // MXNET_USE_CUDA } } else { if constexpr (std::is_same::value) { @@ -278,6 +280,7 @@ class LeakyReLUOp : public Operator { new_lshape, new_rshape, new_oshape); }); } else { +#if MXNET_USE_CUDA std::vector new_in_grad(2); new_in_grad[leakyrelu::kData] = in_grad[leakyrelu::kData]; new_in_grad[leakyrelu::kGamma] = in_grad[leakyrelu::kGamma].reshape(gshape); @@ -286,6 +289,7 @@ class LeakyReLUOp : public Operator { in_data[leakyrelu::kData], in_data[leakyrelu::kGamma]}, req, new_in_grad); +#endif // MXNET_USE_CUDA } } break; diff --git a/src/operator/numpy/linalg/np_matrix_rank-inl.h b/src/operator/numpy/linalg/np_matrix_rank-inl.h index bde03c0ab845..9c53a06957e9 100644 --- a/src/operator/numpy/linalg/np_matrix_rank-inl.h +++ b/src/operator/numpy/linalg/np_matrix_rank-inl.h @@ -415,9 +415,11 @@ void MatrixRankForwardImpl(const TBlob& a, {s_data, new_tol_data}, {kWriteTo}, {broadcast_data}); } else { +#if MXNET_USE_CUDA mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, {s_data, new_tol_data}, {kWriteTo}, {broadcast_data}); +#endif // MXNET_USE_CUDA } // Step5: Calculate rank. const int b_ndim = broadcast_shape.ndim(); diff --git a/src/operator/numpy/linalg/np_pinv-inl.h b/src/operator/numpy/linalg/np_pinv-inl.h index a82597edb877..2622c1de3d15 100644 --- a/src/operator/numpy/linalg/np_pinv-inl.h +++ b/src/operator/numpy/linalg/np_pinv-inl.h @@ -562,6 +562,7 @@ void PinvOpForwardImpl(const TBlob& a, {s_data, cutoff_data}, temp_req, {large_data}); } else { +#if MXNET_USE_CUDA mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {rcond_data, smax_data}, temp_req, {cutoff_data}); @@ -569,6 +570,7 @@ void PinvOpForwardImpl(const TBlob& a, mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, {s_data, cutoff_data}, temp_req, {large_data}); +#endif // MXNET_USE_CUDA } // Step5: Discard small singular values. 
mxnet_op::Kernel::Launch( @@ -587,8 +589,10 @@ void PinvOpForwardImpl(const TBlob& a, mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, temp_req, {u_data}); } else { +#if MXNET_USE_CUDA mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, temp_req, {u_data}); +#endif // MXNET_USE_CUDA } gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), @@ -731,8 +735,10 @@ void PinvScalarRcondOpForwardImpl(const TBlob& a, mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, {kWriteTo}, {u_data}); } else { +#if MXNET_USE_CUDA mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, {kWriteTo}, {u_data}); +#endif // MXNET_USE_CUDA } gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 13d7e908da4c..45b8a6871bfb 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -836,8 +836,10 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, BinaryBroadcastCompute( attrs, ctx, {data, weights}, {kWriteTo}, {wa}); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}( attrs, ctx, {data, weights}, {kWriteTo}, {wa}); +#endif // MXNET_USE_CUDA } // Compute sum of weighted data @@ -859,8 +861,10 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, BinaryBroadcastCompute( attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"div"}( attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); +#endif // MXNET_USE_CUDA } } else { // Compute and assign the derivatives of a and weights diff --git a/src/operator/numpy/np_cross-inl.h b/src/operator/numpy/np_cross-inl.h index ce0bb1fe1d8d..813d93ff0468 100644 --- a/src/operator/numpy/np_cross-inl.h +++ b/src/operator/numpy/np_cross-inl.h @@ -394,8 +394,10 @@ struct NumpyCrossForwardImpl { BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, { kWriteTo }, { cw_data_vec[idx] }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, { kWriteTo }, { cw_data_vec[idx] }); +#endif // MXNET_USE_CUDA } MXNET_ASSIGN_REQ_SWITCH(req_vec[i], req_type, { mxnet_op::Kernel, xpu>::Launch(s, cw_data_vec[idx].Size(), @@ -502,8 +504,10 @@ struct NumpyCrossForwardImpl { BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, { req[0] }, { c }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, { req[0] }, { c }); +#endif // MXNET_USE_CUDA } mxnet_op::Kernel::Launch(s, aw_data.Size(), a_data.dptr(), aw_data.dptr(), a_data.size(a_ndim - 1), @@ -517,10 +521,12 @@ struct NumpyCrossForwardImpl { BinaryBroadcastCompute(attrs, ctx, { c, cw_data }, { kWriteTo }, { c }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, { kWriteTo }, { cw_data }); BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, { c, cw_data }, { kWriteTo }, { c }); +#endif // MXNET_USE_CUDA } } }; @@ -1216,8 +1222,10 @@ struct NumpyCrossBackwardImpl { BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, { kWriteTo }, { cw_data }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, { kWriteTo }, { cw_data }); +#endif // MXNET_USE_CUDA } // Copy cw_data to grad_move_data[..., 0]. 
mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), @@ -1236,8 +1244,10 @@ struct NumpyCrossBackwardImpl { BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, { kWriteTo }, { cw_data }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, { kWriteTo }, { cw_data }); +#endif // MXNET_USE_CUDA } // Copy -cw_data to grad_move_data[..., 1]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), @@ -1287,8 +1297,10 @@ struct NumpyCrossBackwardImpl { BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, { kWriteTo }, { cw_data }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, { kWriteTo }, { cw_data }); +#endif // MXNET_USE_CUDA } // Copy -cw_data to grad_move_data[..., 0]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), @@ -1307,8 +1319,10 @@ struct NumpyCrossBackwardImpl { BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, { kWriteTo }, { cw_data }); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, { kWriteTo }, { cw_data }); +#endif // MXNET_USE_CUDA } // Copy cw_data to grad_move_data[..., 1]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), From d889bdc683234a67837e9518ca4408f428be1d66 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 09:09:01 -0700 Subject: [PATCH 19/64] More guards --- src/operator/nn/layer_norm-inl.h | 14 +++++++++----- src/operator/tensor/elemwise_unary_op.h | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index e6e5a41f0fac..f7cf58ad45c6 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -145,7 +145,7 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, {inputs[0], outputs[layernorm::kMean]}, {kWriteTo}, {outputs[0]}); -#endif +#endif // MXNET_USE_CUDA } // Calculate std const TBlob centered_out = outputs[0].reshape(red_src_shape); @@ -190,7 +190,7 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, BinaryBroadcastRTCCompute {"add"}(attrs, ctx, {outputs[0], beta}, {kWriteTo}, {outputs[0]}); -#endif +#endif // MXNET_USE_CUDA } } @@ -289,7 +289,7 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, BinaryBroadcastRTCCompute {"div"}(attrs, ctx, {normalized_data, std}, {kWriteTo}, {normalized_data}); -#endif +#endif // MXNET_USE_CUDA } // Calculate grad_beta bool safe_acc = dmlc::GetEnv("MXNET_SAFE_ACCUMULATION", false); @@ -313,8 +313,10 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, {kWriteTo}, {ograd_mult}); } else { +#if MXNET_USE_CUDA ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, {kWriteTo}, {ograd_mult}); +#endif // MXNET_USE_CUDA } if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { @@ -351,7 +353,7 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, BinaryBroadcastRTCCompute {"div"}(attrs, ctx, {ograd_mult, std}, {kWriteTo}, {ograd_mult}); -#endif +#endif // MXNET_USE_CUDA } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { @@ -375,11 +377,13 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); } else { +#if MXNET_USE_CUDA BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, {ograd_mult, 
red_out}, {req[0]}, {outputs[0]}); ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, {kWriteTo}, {ograd_mult}); +#endif // MXNET_USE_CUDA } MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { @@ -405,7 +409,7 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {normalized_data, red_out}, {kAddTo}, {outputs[0]}); -#endif +#endif // MXNET_USE_CUDA } } } diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 9e9448d056a3..0184c2c874b0 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -111,7 +111,7 @@ void inline CopyBlob(mshadow::Stream *s, * \param dest Destination sparse NDArray * \param clone_from sparse NDArray from which to clone storage attributes */ -void AllocateGeometry(const NDArray *dest, +void inline AllocateGeometry(const NDArray *dest, const OpReqType req, const NDArray* clone_from = nullptr) { if (req != kNullOp) { From b09090ca4564a3e76367ffeb8ade45f521d24482 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 09:20:38 -0700 Subject: [PATCH 20/64] C++17 for CUDA code --- 3rdparty/mshadow/make/mshadow.mk | 2 +- Makefile | 6 +++--- src/common/cuda/rtc.cc | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/3rdparty/mshadow/make/mshadow.mk b/3rdparty/mshadow/make/mshadow.mk index c647fa342ed2..8ccfad10d256 100644 --- a/3rdparty/mshadow/make/mshadow.mk +++ b/3rdparty/mshadow/make/mshadow.mk @@ -172,7 +172,7 @@ PS_LIB = $(addprefix $(PS_PATH)/build/, libps.a libps_main.a) \ $(addprefix $(PS_THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a \ libglog.a libz.a libsnappy.a) # -L$(PS_THIRD_PATH)/lib -lgflags -lzmq -lprotobuf -lglog -lz -lsnappy -MSHADOW_NVCCFLAGS += --std=c++14 +MSHADOW_NVCCFLAGS += --std=c++17 else MSHADOW_CFLAGS+= -DMSHADOW_DIST_PS=0 endif diff --git a/Makefile b/Makefile index 8d3e6f5bb50d..8f484912ae2d 100644 --- a/Makefile +++ b/Makefile @@ -131,9 +131,9 @@ endif # -L/usr/local/lib ifeq ($(DEBUG), 1) - NVCCFLAGS += -std=c++14 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + NVCCFLAGS += -std=c++17 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) else - NVCCFLAGS += -std=c++14 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + NVCCFLAGS += -std=c++17 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) endif # CFLAGS for segfault logger @@ -693,7 +693,7 @@ build/libtransposerowsp_lib.so: build/libcustomop_gpu_lib.so: @mkdir -p $(@D) $(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o /dev/null -I include/mxnet - $(NVCC) -shared -std=c++14 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet + $(NVCC) -shared -std=c++17 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet build/libsubgraph_lib.so: @mkdir -p $(@D) $(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc -o /dev/null -I include/mxnet diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 5a0ee562f7cc..1a931ab3e20b 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -147,7 +147,7 @@ CUfunction get_function(const std::string &code, std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); const char *opts[] = {gpu_arch_arg.c_str(), - "--std=c++11"}; + "--std=c++14"}; const 
std::string kernel_name_demangled = kernel_name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); From 7415264b5a46243ede2b27be7d82f4f1b41b77d5 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 09:45:55 -0700 Subject: [PATCH 21/64] MixedUnaryBackwardInOut as RTC --- src/common/cuda/rtc.cc | 1 + src/common/cuda/rtc/vectorization-inl.h | 6 +- .../numpy/np_elemwise_unary_op_basic.cu | 67 ++++++------------- src/operator/tensor/elemwise_unary_op.cc | 9 +++ src/operator/tensor/elemwise_unary_op.h | 13 +++- 5 files changed, 47 insertions(+), 49 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 1a931ab3e20b..4543708975ec 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -60,6 +60,7 @@ std::string to_string(OpReqType req) { return "OpReqType::kAddTo"; } LOG(FATAL) << "Unrecognized req."; + return ""; } } // namespace util diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index a5c8c48426c4..f90b1565cdef 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -234,8 +234,8 @@ class VectorizedStorer : public VectorizedAccessor { namespace { -index_t get_num_aligned_elements(const void *ptr, const index_t lead_dim, - const int nvec, const int size) { +inline index_t get_num_aligned_elements(const void *ptr, const index_t lead_dim, + const int nvec, const int size) { size_t ptr_as_number = reinterpret_cast(ptr); int alignment = (ptr_as_number % (nvec * size)) / size; return (lead_dim + alignment + nvec - 1) / nvec; @@ -247,7 +247,7 @@ enum class Alignment { DIFFERENT // Tensors have different alignment }; -int CalcAlignment(const void *ptr, const int size) { +inline int CalcAlignment(const void *ptr, const int size) { size_t ptr_as_number = reinterpret_cast(ptr); return ptr_as_number % size; } diff --git a/src/operator/numpy/np_elemwise_unary_op_basic.cu b/src/operator/numpy/np_elemwise_unary_op_basic.cu index 025abc8233e8..2f5c08c410f0 100644 --- a/src/operator/numpy/np_elemwise_unary_op_basic.cu +++ b/src/operator/numpy/np_elemwise_unary_op_basic.cu @@ -126,93 +126,70 @@ NNVM_REGISTER_OP(_npi_backward_nan_to_num) .set_attr("FCompute", NumpyNanToNumOpBackward); NNVM_REGISTER_OP(_backward_npi_exp) -.set_attr("FCompute", - ElemwiseBinaryOp::MixedUnaryBackwardUseInOutCompute); +.set_attr("FCompute", UnaryBwdInOutRTCCompute{"mul"}); NNVM_REGISTER_OP(_backward_npi_log) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log"}); NNVM_REGISTER_OP(_backward_npi_log10) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< -gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log10"}); NNVM_REGISTER_OP(_backward_npi_log2) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log2"}); NNVM_REGISTER_OP(_backward_npi_log1p) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_log1p"}); NNVM_REGISTER_OP(_backward_npi_expm1) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_expm1"}); NNVM_REGISTER_OP(_backward_npi_sqrt) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInOutCompute< - 
gpu, unary_bwd >); +.set_attr("FCompute", UnaryBwdInOutRTCCompute{"backward_sqrt"}); NNVM_REGISTER_OP(_backward_npi_cbrt) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInOutCompute< - gpu, unary_bwd >); +.set_attr("FCompute", UnaryBwdInOutRTCCompute{"backward_cbrt"}); NNVM_REGISTER_OP(_backward_npi_sin) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_sin"}); NNVM_REGISTER_OP(_backward_npi_cos) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_cos"}); NNVM_REGISTER_OP(_backward_npi_tan) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInOutCompute< - gpu, unary_bwd >); +.set_attr("FCompute", UnaryBwdInOutRTCCompute{"backward_tan"}); NNVM_REGISTER_OP(_backward_npi_arcsin) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arcsin"}); NNVM_REGISTER_OP(_backward_npi_arccos) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arccos"}); NNVM_REGISTER_OP(_backward_npi_arctan) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arctan"}); NNVM_REGISTER_OP(_backward_npi_degrees) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_degrees"}); NNVM_REGISTER_OP(_backward_npi_radians) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_radians"}); NNVM_REGISTER_OP(_backward_npi_cosh) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_cosh"}); NNVM_REGISTER_OP(_backward_npi_sinh) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_sinh"}); NNVM_REGISTER_OP(_backward_npi_tanh) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInOutCompute< - gpu, unary_bwd >); +.set_attr("FCompute", UnaryBwdInOutRTCCompute{"backward_tanh"}); NNVM_REGISTER_OP(_backward_npi_arcsinh) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arcsinh"}); NNVM_REGISTER_OP(_backward_npi_arccosh) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arccosh"}); NNVM_REGISTER_OP(_backward_npi_arctanh) -.set_attr("FCompute", ElemwiseBinaryOp::MixedUnaryBackwardUseInCompute< - gpu, unary_bwd >); +.set_attr("FCompute", ElemwiseBinaryRTCCompute{"backward_arctanh"}); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index c3e5369f9e67..a84d97e9ac89 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -19,6 +19,7 @@ #include #include "elemwise_unary_op.h" +#include "elemwise_binary_op.h" #if MXNET_USE_CUDA #include "../../common/cuda/rtc/vectorization-inl.h" @@ -147,6 +148,14 @@ 
void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, } } +void UnaryBwdInOutRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + ElemwiseBinaryRTCCompute {OP} (attrs, ctx, {inputs[0], inputs[2]}, req, outputs); +} + #endif // MXNET_USE_CUDA } // namespace op diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 0184c2c874b0..5cb80f732e41 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -932,7 +932,18 @@ struct UnaryRTCCompute { const std::vector& outputs); }; -#endif +struct UnaryBwdInOutRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +}; + +#endif // MXNET_USE_CUDA } // namespace op } // namespace mxnet From 49ee7498a41341b5d9b1b22c0c6002d661a5de82 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 11:04:26 -0700 Subject: [PATCH 22/64] Removing unused variable --- src/operator/tensor/elemwise_binary_op.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index a6b5d7263c76..dc44dda73822 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -769,7 +769,6 @@ template using namespace common; CHECK_EQ(inputs.size(), 3U); CHECK_EQ(outputs.size(), 2U); // lhs input grad, rhs input grad - const auto out_grad_stype = inputs[0].storage_type(); const auto lhs_grad_stype = outputs[0].storage_type(); const auto rhs_grad_stype = outputs[1].storage_type(); if (ContainsOnlyStorage(inputs, kRowSparseStorage) && From c23fa6a5a8bb6ff23b3085d40d47f554efcfbebe Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 11:24:26 -0700 Subject: [PATCH 23/64] Revert "C++17 for CUDA code" This reverts commit b09090ca4564a3e76367ffeb8ade45f521d24482. 
--- 3rdparty/mshadow/make/mshadow.mk | 2 +- Makefile | 6 +++--- src/common/cuda/rtc.cc | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/3rdparty/mshadow/make/mshadow.mk b/3rdparty/mshadow/make/mshadow.mk index 8ccfad10d256..c647fa342ed2 100644 --- a/3rdparty/mshadow/make/mshadow.mk +++ b/3rdparty/mshadow/make/mshadow.mk @@ -172,7 +172,7 @@ PS_LIB = $(addprefix $(PS_PATH)/build/, libps.a libps_main.a) \ $(addprefix $(PS_THIRD_PATH)/lib/, libgflags.a libzmq.a libprotobuf.a \ libglog.a libz.a libsnappy.a) # -L$(PS_THIRD_PATH)/lib -lgflags -lzmq -lprotobuf -lglog -lz -lsnappy -MSHADOW_NVCCFLAGS += --std=c++17 +MSHADOW_NVCCFLAGS += --std=c++14 else MSHADOW_CFLAGS+= -DMSHADOW_DIST_PS=0 endif diff --git a/Makefile b/Makefile index 8f484912ae2d..8d3e6f5bb50d 100644 --- a/Makefile +++ b/Makefile @@ -131,9 +131,9 @@ endif # -L/usr/local/lib ifeq ($(DEBUG), 1) - NVCCFLAGS += -std=c++17 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + NVCCFLAGS += -std=c++14 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) else - NVCCFLAGS += -std=c++17 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) + NVCCFLAGS += -std=c++14 -Xcompiler -D_FORCE_INLINES -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) endif # CFLAGS for segfault logger @@ -693,7 +693,7 @@ build/libtransposerowsp_lib.so: build/libcustomop_gpu_lib.so: @mkdir -p $(@D) $(NVCC) -shared -std=c++11 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o /dev/null -I include/mxnet - $(NVCC) -shared -std=c++17 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet + $(NVCC) -shared -std=c++14 -Xcompiler -fPIC example/extensions/lib_custom_op/relu_lib.cu -o $@ -I include/mxnet build/libsubgraph_lib.so: @mkdir -p $(@D) $(CXX) -shared -fPIC -std=c++11 example/extensions/lib_subgraph/subgraph_lib.cc -o /dev/null -I include/mxnet diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 4543708975ec..d4d97c6d6bc6 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -148,7 +148,7 @@ CUfunction get_function(const std::string &code, std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); const char *opts[] = {gpu_arch_arg.c_str(), - "--std=c++14"}; + "--std=c++11"}; const std::string kernel_name_demangled = kernel_name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); From e2065f25679c64d7294e38f27d8355b85ae14831 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 11:42:51 -0700 Subject: [PATCH 24/64] Get rid of CI tests without RTC Get rid of if constexpr as CUDA 10 does not support it --- ci/docker/runtime_functions.sh | 21 -- ci/jenkins/Jenkins_steps.groovy | 14 -- ci/jenkins/Jenkinsfile_unix_gpu | 1 - src/operator/leaky_relu-inl.h | 66 +++--- src/operator/nn/group_norm-inl.h | 212 ++++++++---------- src/operator/nn/layer_norm-inl.h | 202 ++++++++--------- .../numpy/linalg/np_matrix_rank-inl.h | 20 +- src/operator/numpy/linalg/np_pinv-inl.h | 68 +++--- src/operator/numpy/np_broadcast_reduce_op.h | 32 ++- src/operator/numpy/np_cross-inl.h | 120 +++++----- 10 files changed, 330 insertions(+), 426 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 666c59e6e4aa..38404cd669f4 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -813,27 +813,6 @@ build_ubuntu_gpu_cmake() { ninja } -build_ubuntu_gpu_cmake_no_rtc() { - set -ex - cd /work/build - CC=gcc-7 CXX=g++-7 cmake \ - 
-DUSE_SIGNAL_HANDLER=ON \ - -DUSE_CUDA=ON \ - -DUSE_CUDNN=ON \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_MKLML_MKL=OFF \ - -DUSE_MKLDNN=ON \ - -DUSE_DIST_KVSTORE=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DBUILD_CYTHON_MODULES=1 \ - -DENABLE_CUDA_RTC=OFF \ - -G Ninja \ - /work/mxnet - - ninja -} - build_ubuntu_cpu_large_tensor() { set -ex cd /work/build diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 923d41b4c383..407e7bf03bee 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -325,20 +325,6 @@ def compile_unix_cmake_gpu(lib_name) { }] } -def compile_unix_cmake_gpu_no_rtc(lib_name) { - return ['GPU: CMake CUDA RTC OFF': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cmake-gpu-no-rtc') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_rtc', false) - utils.pack_lib(lib_name, mx_cmake_lib) - } - } - } - }] -} - def compile_unix_tensorrt_gpu(lib_name) { return ['TensorRT': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 1fe96bf690df..6f42c60cf0a9 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -42,7 +42,6 @@ core_logic: { custom_steps.compile_unix_cmake_gpu('cmake_gpu'), custom_steps.compile_unix_tensorrt_gpu('tensorrt'), custom_steps.compile_unix_int64_gpu('gpu_int64'), - custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'), custom_steps.compile_unix_full_gpu_mkldnn_cpp_test('gpu_mkldnn_cpp_test_make') ]) diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index d87fcbbc5b6c..945bd00f74c1 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -254,43 +254,39 @@ class LeakyReLUOp : public Operator { &new_rshape, &new_oshape) != 0; if (!need_bc) { - if constexpr (std::is_same::value) { - ElemwiseBinaryOp::BackwardUseIn( - nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], - in_data[leakyrelu::kData], - in_data[leakyrelu::kGamma]}, req, in_grad); - } else { -#if MXNET_USE_CUDA - ElemwiseBinaryRTCBwdUseIn {"xelu_grad", "prelu_grad"}( +#if !defined(__CUDACC__) + ElemwiseBinaryOp::BackwardUseIn( + nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, req, in_grad); +#else + ElemwiseBinaryRTCBwdUseIn {"xelu_grad", "prelu_grad"}( + nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, req, in_grad); +#endif // !defined(__CUDACC__) + } else { +#if !defined(__CUDACC__) + BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { + BinaryBroadcastBackwardUseInImpl( + ctx, {out_grad[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}, req, in_grad, + new_lshape, new_rshape, new_oshape); + }); +#else + std::vector new_in_grad(2); + new_in_grad[leakyrelu::kData] = in_grad[leakyrelu::kData]; + new_in_grad[leakyrelu::kGamma] = in_grad[leakyrelu::kGamma].reshape(gshape); + BinaryBroadcastRTCBackwardUseIn {"xelu_grad", "prelu_grad"}( nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], in_data[leakyrelu::kData], - in_data[leakyrelu::kGamma]}, req, in_grad); -#endif // MXNET_USE_CUDA - } - } else { - if constexpr (std::is_same::value) { - BROADCAST_NDIM_SWITCH(new_oshape.ndim(), NDim, { - BinaryBroadcastBackwardUseInImpl( - ctx, {out_grad[leakyrelu::kOut], - in_data[leakyrelu::kData], - in_data[leakyrelu::kGamma]}, req, in_grad, - new_lshape, 
new_rshape, new_oshape); - }); - } else { -#if MXNET_USE_CUDA - std::vector new_in_grad(2); - new_in_grad[leakyrelu::kData] = in_grad[leakyrelu::kData]; - new_in_grad[leakyrelu::kGamma] = in_grad[leakyrelu::kGamma].reshape(gshape); - BinaryBroadcastRTCBackwardUseIn {"xelu_grad", "prelu_grad"}( - nnvm::NodeAttrs(), ctx, {out_grad[leakyrelu::kOut], - in_data[leakyrelu::kData], - in_data[leakyrelu::kGamma]}, - req, new_in_grad); -#endif // MXNET_USE_CUDA - } + in_data[leakyrelu::kGamma]}, + req, new_in_grad); +#endif // !defined(__CUDACC__) } break; } diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index 7cd2ca8997e3..da30192231c7 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -138,17 +138,15 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, const TBlob& output_grp = outputs[groupnorm::kOut].reshape(temp_data_shape); // Calculate data = data - mean - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {data_grp, mean_grp}, - {kWriteTo}, {output_grp}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, - {data_grp, mean_grp}, - {kWriteTo}, {output_grp}); -#endif - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {data_grp, mean_grp}, + {kWriteTo}, {output_grp}); +#else + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {data_grp, mean_grp}, + {kWriteTo}, {output_grp}); +#endif // !defined(__CUDACC__) // Calculate std const TBlob centered_out = outputs[groupnorm::kOut].reshape(red_src_shape); @@ -163,17 +161,15 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, }); // Calculate data = data / std - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {output_grp, std_grp}, - {kWriteTo}, {output_grp}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"div"}(attrs, ctx, - {output_grp, std_grp}, - {kWriteTo}, {output_grp}); -#endif - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {output_grp, std_grp}, + {kWriteTo}, {output_grp}); +#else + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {output_grp, std_grp}, + {kWriteTo}, {output_grp}); +#endif // !defined(__CUDACC__) const TBlob& output = outputs[groupnorm::kOut]; mxnet::TShape new_param_shape(data_shape.ndim(), 1); @@ -182,27 +178,25 @@ void GroupNormCompute(const nnvm::NodeAttrs& attrs, const TBlob& gamma = inputs[groupnorm::kGamma].reshape(new_param_shape); const TBlob& beta = inputs[groupnorm::kBeta].reshape(new_param_shape); - if constexpr (std::is_same::value) { - // Calculate data = data * gamma - BinaryBroadcastCompute(attrs, ctx, - {output, gamma}, - {kWriteTo}, {output}); - // Calculate data = data + beta - BinaryBroadcastCompute(attrs, ctx, - {output, beta}, - {kWriteTo}, {output}); - } else { -#if MXNET_USE_CUDA - // Calculate data = data * gamma - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {output, gamma}, - {kWriteTo}, {output}); - // Calculate data = data + beta - BinaryBroadcastRTCCompute {"add"}(attrs, ctx, - {output, beta}, - {kWriteTo}, {output}); -#endif - } +#if !defined(__CUDACC__) + // Calculate data = data * gamma + BinaryBroadcastCompute(attrs, ctx, + {output, gamma}, + {kWriteTo}, {output}); + // Calculate data = data + beta + BinaryBroadcastCompute(attrs, ctx, + {output, beta}, + {kWriteTo}, {output}); +#else + // Calculate data = data * gamma + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {output, gamma}, + {kWriteTo}, {output}); + // Calculate data = data + beta + BinaryBroadcastRTCCompute {"add"}(attrs, 
ctx, + {output, beta}, + {kWriteTo}, {output}); +#endif // !defined(__CUDACC__) } /* @@ -299,23 +293,21 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, const TBlob red_out = TBlob(workspace.dptr_ + reduce_workspace_size + data_size * 2, mean_.shape_, mean.dev_mask(), mean.type_flag_, mean.dev_id()); // Compute normalized_data = (data - mean) / std - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {data_, mean_}, - {kWriteTo}, {normalized_data}); - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, std_}, - {kWriteTo}, {normalized_data}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, - {data_, mean_}, - {kWriteTo}, {normalized_data}); - BinaryBroadcastRTCCompute {"div"}(attrs, ctx, - {normalized_data, std_}, - {kWriteTo}, {normalized_data}); -#endif - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {data_, mean_}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, std_}, + {kWriteTo}, {normalized_data}); +#else + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {data_, mean_}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {normalized_data, std_}, + {kWriteTo}, {normalized_data}); +#endif // !defined(__CUDACC__) // Calculate grad_beta if (req[2] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[2].type_flag_, DType, { @@ -327,15 +319,13 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, }); } // Calculate grad_gamma, it will be sum(ograd * normalized_data, exclude_axis) - if constexpr (std::is_same::value) { - ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); - } else { -#if MXNET_USE_CUDA - ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); -#endif - } +#if !defined(__CUDACC__) + ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); +#else + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); +#endif // !defined(__CUDACC__) if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { @@ -352,25 +342,23 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, // + normalized_data * (-mean(normalized_data * ograd_mult, axis)) if (req[0] != kNullOp) { const TBlob output_ = outputs[0].reshape(data_.shape_); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {inputs[0], gamma}, - {kWriteTo}, - {ograd_mult.reshape(data.shape_)}); - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, std_}, - {kWriteTo}, {ograd_mult}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {inputs[0], gamma}, - {kWriteTo}, - {ograd_mult.reshape(data.shape_)}); - BinaryBroadcastRTCCompute {"div"}(attrs, ctx, - {ograd_mult, std_}, - {kWriteTo}, {ograd_mult}); -#endif - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {inputs[0], gamma}, + {kWriteTo}, + {ograd_mult.reshape(data.shape_)}); + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, std_}, + {kWriteTo}, {ograd_mult}); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {inputs[0], gamma}, + {kWriteTo}, + {ograd_mult.reshape(data.shape_)}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {ograd_mult, std_}, + {kWriteTo}, {ograd_mult}); +#endif // !defined(__CUDACC__) MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { 
BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { broadcast::Reduce( @@ -380,21 +368,19 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(N); }); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, red_out}, - {req[0]}, {output_}); - ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, - {ograd_mult, red_out}, - {req[0]}, {output_}); - ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); -#endif - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {output_}); + ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); +#else + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {output_}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); +#endif // !defined(__CUDACC__) MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { broadcast::Reduce( @@ -404,17 +390,15 @@ void GroupNormGradCompute(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(-N); }); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, red_out}, - {kAddTo}, {output_}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {normalized_data, red_out}, - {kAddTo}, {output_}); -#endif - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {output_}); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {output_}); +#endif // !defined(__CUDACC__) } } diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index f7cf58ad45c6..7cf3993c784e 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -136,17 +136,15 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, }); }); // Calculate data = data - mean - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {inputs[0], outputs[layernorm::kMean]}, - {kWriteTo}, {outputs[0]}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, - {inputs[0], outputs[layernorm::kMean]}, - {kWriteTo}, {outputs[0]}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {inputs[0], outputs[layernorm::kMean]}, + {kWriteTo}, {outputs[0]}); +#else + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {inputs[0], outputs[layernorm::kMean]}, + {kWriteTo}, {outputs[0]}); +#endif // !defined(__CUDACC__) // Calculate std const TBlob centered_out = outputs[0].reshape(red_src_shape); MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -163,35 +161,33 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs, + scalar(param.eps)); }); }); - if constexpr (std::is_same::value) { - // Calculate data = data / std - BinaryBroadcastCompute(attrs, ctx, - {outputs[0], outputs[layernorm::kStd]}, - {kWriteTo}, {outputs[0]}); - // Calculate data = data * gamma - BinaryBroadcastCompute(attrs, ctx, - {outputs[0], gamma}, - {kWriteTo}, {outputs[0]}); - // Calculate data = data + beta - BinaryBroadcastCompute(attrs, ctx, - {outputs[0], 
beta}, - {kWriteTo}, {outputs[0]}); - } else { -#if MXNET_USE_CUDA - // Calculate data = data / std - BinaryBroadcastRTCCompute {"div"}(attrs, ctx, - {outputs[0], outputs[layernorm::kStd]}, - {kWriteTo}, {outputs[0]}); - // Calculate data = data * gamma - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {outputs[0], gamma}, - {kWriteTo}, {outputs[0]}); - // Calculate data = data + beta - BinaryBroadcastRTCCompute {"add"}(attrs, ctx, - {outputs[0], beta}, - {kWriteTo}, {outputs[0]}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + // Calculate data = data / std + BinaryBroadcastCompute(attrs, ctx, + {outputs[0], outputs[layernorm::kStd]}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data * gamma + BinaryBroadcastCompute(attrs, ctx, + {outputs[0], gamma}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data + beta + BinaryBroadcastCompute(attrs, ctx, + {outputs[0], beta}, + {kWriteTo}, {outputs[0]}); +#else + // Calculate data = data / std + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {outputs[0], outputs[layernorm::kStd]}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data * gamma + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {outputs[0], gamma}, + {kWriteTo}, {outputs[0]}); + // Calculate data = data + beta + BinaryBroadcastRTCCompute {"add"}(attrs, ctx, + {outputs[0], beta}, + {kWriteTo}, {outputs[0]}); +#endif // !defined(__CUDACC__) } template @@ -274,23 +270,21 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, const TBlob red_out = TBlob(workspace.dptr_ + reduce_workspace_size + data_size * 2, mean.shape_, mean.dev_mask(), mean.type_flag_, mean.dev_id()); // Compute normalized_data = (data - mean) / std - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {data, mean}, - {kWriteTo}, {normalized_data}); - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, std}, +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {data, mean}, {kWriteTo}, {normalized_data}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, - {data, mean}, - {kWriteTo}, {normalized_data}); - BinaryBroadcastRTCCompute {"div"}(attrs, ctx, - {normalized_data, std}, - {kWriteTo}, {normalized_data}); -#endif // MXNET_USE_CUDA - } + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, std}, + {kWriteTo}, {normalized_data}); +#else + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {data, mean}, + {kWriteTo}, {normalized_data}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {normalized_data, std}, + {kWriteTo}, {normalized_data}); +#endif // !defined(__CUDACC__) // Calculate grad_beta bool safe_acc = dmlc::GetEnv("MXNET_SAFE_ACCUMULATION", false); if (req[2] != kNullOp) { @@ -309,15 +303,13 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, }); } // Calculate grad_gamma, it will be sum(ograd * normalized_data, exclude_axis) - if constexpr (std::is_same::value) { - ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); - } else { -#if MXNET_USE_CUDA - ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, - {kWriteTo}, {ograd_mult}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + ElemwiseBinaryOp::Compute(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); +#else + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {normalized_data, ograd}, + {kWriteTo}, {ograd_mult}); +#endif // !defined(__CUDACC__) if (req[1] != kNullOp) { MSHADOW_REAL_TYPE_SWITCH(outputs[1].type_flag_, DType, { 
BROADCAST_NDIM_SWITCH(red_exclude_dst_shape.ndim(), NDim, { @@ -338,23 +330,21 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, // grad_data = ograd_mult - mean(ograd_mult, axis) // + normalized_data * (-mean(normalized_data * ograd_mult, axis)) if (req[0] != kNullOp) { - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {ograd, gamma}, - {kWriteTo}, {ograd_mult}); - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, std}, - {kWriteTo}, {ograd_mult}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {ograd, gamma}, - {kWriteTo}, {ograd_mult}); - BinaryBroadcastRTCCompute {"div"}(attrs, ctx, - {ograd_mult, std}, - {kWriteTo}, {ograd_mult}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {ograd, gamma}, + {kWriteTo}, {ograd_mult}); + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, std}, + {kWriteTo}, {ograd_mult}); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {ograd, gamma}, + {kWriteTo}, {ograd_mult}); + BinaryBroadcastRTCCompute {"div"}(attrs, ctx, + {ograd_mult, std}, + {kWriteTo}, {ograd_mult}); +#endif // !defined(__CUDACC__) MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { if (safe_acc) { @@ -370,21 +360,19 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(channel_size); }); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {ograd_mult, red_out}, - {req[0]}, {outputs[0]}); - ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, - {ograd_mult, red_out}, - {req[0]}, {outputs[0]}); - ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, - {kWriteTo}, {ograd_mult}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {outputs[0]}); + ElemwiseBinaryOp::Compute(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); +#else + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, + {ograd_mult, red_out}, + {req[0]}, {outputs[0]}); + ElemwiseBinaryRTCCompute {"mul"}(attrs, ctx, {ograd_mult, normalized_data}, + {kWriteTo}, {ograd_mult}); +#endif // !defined(__CUDACC__) MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(red_dst_shape.ndim(), NDim, { if (safe_acc) { @@ -400,17 +388,15 @@ void LayerNormGradComputeGeneral(const nnvm::NodeAttrs& attrs, Tensor red_out_tensor = red_out.FlatTo1D(s); red_out_tensor /= scalar(- channel_size); }); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, - {normalized_data, red_out}, - {kAddTo}, {outputs[0]}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {normalized_data, red_out}, - {kAddTo}, {outputs[0]}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {outputs[0]}); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {normalized_data, red_out}, + {kAddTo}, {outputs[0]}); +#endif // !defined(__CUDACC__) } } diff --git a/src/operator/numpy/linalg/np_matrix_rank-inl.h b/src/operator/numpy/linalg/np_matrix_rank-inl.h index 9c53a06957e9..fa4b4c74752f 100644 --- a/src/operator/numpy/linalg/np_matrix_rank-inl.h +++ 
b/src/operator/numpy/linalg/np_matrix_rank-inl.h @@ -410,17 +410,15 @@ void MatrixRankForwardImpl(const TBlob& a, if (new_tol_data.dptr() != tol.dptr()) { Copy(new_tol_data.FlatTo1D(s), tol.FlatTo1D(s), s); } - if constexpr (std::is_same::value) { - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {s_data, new_tol_data}, - {kWriteTo}, {broadcast_data}); - } else { -#if MXNET_USE_CUDA - mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, - {s_data, new_tol_data}, - {kWriteTo}, {broadcast_data}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {s_data, new_tol_data}, + {kWriteTo}, {broadcast_data}); +#else + mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, + {s_data, new_tol_data}, + {kWriteTo}, {broadcast_data}); +#endif // !defined(__CUDACC__) // Step5: Calculate rank. const int b_ndim = broadcast_shape.ndim(); const int data_size = broadcast_data.size(b_ndim - 1); diff --git a/src/operator/numpy/linalg/np_pinv-inl.h b/src/operator/numpy/linalg/np_pinv-inl.h index 2622c1de3d15..0265e3683217 100644 --- a/src/operator/numpy/linalg/np_pinv-inl.h +++ b/src/operator/numpy/linalg/np_pinv-inl.h @@ -553,25 +553,23 @@ void PinvOpForwardImpl(const TBlob& a, s, S.size(0), Smax.dptr_, S.dptr_, S.size(1), S.stride_); // Step3: Calculate Cutoff. std::vector temp_req({kWriteTo}); - if constexpr (std::is_same::value) { - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {rcond_data, smax_data}, - temp_req, {cutoff_data}); - // Step4: Calculte Large. - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {s_data, cutoff_data}, - temp_req, {large_data}); - } else { -#if MXNET_USE_CUDA - mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {rcond_data, smax_data}, - temp_req, {cutoff_data}); - // Step4: Calculte Large. - mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, - {s_data, cutoff_data}, - temp_req, {large_data}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {rcond_data, smax_data}, + temp_req, {cutoff_data}); + // Step4: Calculte Large. + mxnet::op::BinaryBroadcastCompute(attrs, ctx, + {s_data, cutoff_data}, + temp_req, {large_data}); +#else + mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, + {rcond_data, smax_data}, + temp_req, {cutoff_data}); + // Step4: Calculte Large. + mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, + {s_data, cutoff_data}, + temp_req, {large_data}); +#endif // !defined(__CUDACC__) // Step5: Discard small singular values. 
mxnet_op::Kernel::Launch( s, s_data.Size(), s_data.dptr(), large_data.dptr()); @@ -585,15 +583,13 @@ void PinvOpForwardImpl(const TBlob& a, } s_data = s_data.reshape(s_shape_newaxis); u_data = ut_data.reshape(ut_shape); - if constexpr (std::is_same::value) { - mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, - temp_req, {u_data}); - } else { -#if MXNET_USE_CUDA - mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, - temp_req, {u_data}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, + temp_req, {u_data}); +#else + mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, + temp_req, {u_data}); +#endif // !defined(__CUDACC__) gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), pinv_a.FlatToKD(s), @@ -731,15 +727,13 @@ void PinvScalarRcondOpForwardImpl(const TBlob& a, } s_data = s_data.reshape(s_shape_newaxis); u_data = ut_data.reshape(ut_shape); - if constexpr (std::is_same::value) { - mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, - {kWriteTo}, {u_data}); - } else { -#if MXNET_USE_CUDA - mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, - {kWriteTo}, {u_data}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, + {kWriteTo}, {u_data}); +#else + mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, + {kWriteTo}, {u_data}); +#endif // !defined(__CUDACC__) gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), pinv_a.FlatToKD(s), diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 45b8a6871bfb..8fb70f7d7f45 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -832,15 +832,13 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, // Compute weighted data TBlob wa = TBlob(temp_data_ptr, data.shape_, xpu::kDevMask); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute( - attrs, ctx, {data, weights}, {kWriteTo}, {wa}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}( - attrs, ctx, {data, weights}, {kWriteTo}, {wa}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute( + attrs, ctx, {data, weights}, {kWriteTo}, {wa}); +#else + BinaryBroadcastRTCCompute {"mul"}( + attrs, ctx, {data, weights}, {kWriteTo}, {wa}); +#endif // !defined(__CUDACC__) // Compute sum of weighted data TBlob sum_of_wa = TBlob(temp_sum_ptr, small1, xpu::kDevMask); @@ -857,15 +855,13 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, ctx, {weights}, {kWriteTo}, {scl}, workspace, w_src_shape, w_dst_shape); // Compute avg and assign output - if constexpr (std::is_same::value) { - BinaryBroadcastCompute( - attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"div"}( - attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute( + attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); +#else + BinaryBroadcastRTCCompute {"div"}( + attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); +#endif // !defined(__CUDACC__) } else { // Compute and assign the derivatives of a and weights const TBlob& igrad_a = outputs[0]; diff --git a/src/operator/numpy/np_cross-inl.h b/src/operator/numpy/np_cross-inl.h index 
813d93ff0468..23a3a3326f5f 100644 --- a/src/operator/numpy/np_cross-inl.h +++ b/src/operator/numpy/np_cross-inl.h @@ -390,15 +390,13 @@ struct NumpyCrossForwardImpl { mxnet_op::Kernel::Launch(s, bw_data.Size(), b_data.dptr(), bw_data.dptr(), b_data.size(b_ndim - 1), b_index_vec[i], b_data.Size()); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, - { kWriteTo }, { cw_data_vec[idx] }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, - { kWriteTo }, { cw_data_vec[idx] }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data_vec[idx] }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data_vec[idx] }); +#endif // !defined(__CUDACC__) MXNET_ASSIGN_REQ_SWITCH(req_vec[i], req_type, { mxnet_op::Kernel, xpu>::Launch(s, cw_data_vec[idx].Size(), cw_data_vec[idx].dptr(), @@ -500,34 +498,30 @@ struct NumpyCrossForwardImpl { mxnet_op::Kernel::Launch(s, bw_data.Size(), b_data.dptr(), bw_data.dptr(), b_data.size(b_ndim - 1), 1, b_data.Size()); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, - { req[0] }, { c }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, - { req[0] }, { c }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, + { req[0] }, { c }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, + { req[0] }, { c }); +#endif // !defined(__CUDACC__) mxnet_op::Kernel::Launch(s, aw_data.Size(), a_data.dptr(), aw_data.dptr(), a_data.size(a_ndim - 1), 1, a_data.Size()); mxnet_op::Kernel::Launch(s, bw_data.Size(), b_data.dptr(), bw_data.dptr(), b_data.size(b_ndim - 1), 0, b_data.Size()); - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, - { kWriteTo }, { cw_data }); - BinaryBroadcastCompute(attrs, ctx, { c, cw_data }, - { kWriteTo }, { c }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, - { kWriteTo }, { cw_data }); - BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, { c, cw_data }, - { kWriteTo }, { c }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data }); + BinaryBroadcastCompute(attrs, ctx, { c, cw_data }, + { kWriteTo }, { c }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { aw_data, bw_data }, + { kWriteTo }, { cw_data }); + BinaryBroadcastRTCCompute {"sub"}(attrs, ctx, { c, cw_data }, + { kWriteTo }, { c }); +#endif // !defined(__CUDACC__) } }; @@ -1218,15 +1212,13 @@ struct NumpyCrossBackwardImpl { b_move_data.size(b_ndim - 1), 1, b_move_data.Size()); // cw_data = grad_c_move * b_move_data[..., 1]. - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, - { kWriteTo }, { cw_data }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, - { kWriteTo }, { cw_data }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); +#endif // !defined(__CUDACC__) // Copy cw_data to grad_move_data[..., 0]. 
mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), @@ -1240,15 +1232,13 @@ struct NumpyCrossBackwardImpl { b_move_data.size(b_ndim - 1), 0, b_move_data.Size()); // cw_data = grad_c_move * b_move_data[..., 0]. - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, - { kWriteTo }, { cw_data }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, - { kWriteTo }, { cw_data }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, bw_data }, + { kWriteTo }, { cw_data }); +#endif // !defined(__CUDACC__) // Copy -cw_data to grad_move_data[..., 1]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), @@ -1293,15 +1283,13 @@ struct NumpyCrossBackwardImpl { a_move_data.size(a_ndim - 1), 1, a_move_data.Size()); // cw_data = grad_c_move * a_move_data[..., 1]. - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, - { kWriteTo }, { cw_data }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, - { kWriteTo }, { cw_data }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); +#endif // !defined(__CUDACC__) // Copy -cw_data to grad_move_data[..., 0]. mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), @@ -1315,15 +1303,13 @@ struct NumpyCrossBackwardImpl { a_move_data.size(a_ndim - 1), 0, a_move_data.Size()); // cw_data = grad_c_move * a_move_data[..., 0]. - if constexpr (std::is_same::value) { - BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, - { kWriteTo }, { cw_data }); - } else { -#if MXNET_USE_CUDA - BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, - { kWriteTo }, { cw_data }); -#endif // MXNET_USE_CUDA - } +#if !defined(__CUDACC__) + BinaryBroadcastCompute(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); +#else + BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, { grad_c, aw_data }, + { kWriteTo }, { cw_data }); +#endif // !defined(__CUDACC__) // Copy cw_data to grad_move_data[..., 1]. 
mxnet_op::Kernel, xpu>::Launch(s, cw_data.Size(), cw_data.dptr(), From 784c082683da2a169eecd9523aca862ad2278702 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 12:02:39 -0700 Subject: [PATCH 25/64] Fix lint --- src/operator/tensor/elemwise_unary_op.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 5cb80f732e41..1f0610d63b62 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -940,7 +940,6 @@ struct UnaryBwdInOutRTCCompute { const std::vector& inputs, const std::vector& req, const std::vector& outputs); - }; #endif // MXNET_USE_CUDA From 69f0232a038a3edf80a74efbc4df823b0a8a47c9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 14:17:37 -0700 Subject: [PATCH 26/64] Change a few more elemwise functions Fix for too long value --- src/common/cuda/rtc.cc | 1 - src/common/cuda/rtc/forward_functions-inl.h | 12 +++++++ src/common/cuda/rtc/special_functions-inl.h | 31 ++----------------- .../np_elemwise_broadcast_op_extended.cu | 8 ++--- 4 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index d4d97c6d6bc6..3af945cb4288 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -117,7 +117,6 @@ CUfunction get_function(const std::string &code, std::string(fp16_support_string) + "\n" + type_support_string + "\n" + util_string + "\n" + - float_limits() + special_functions_definitions + '\n' + function_definitions + "\n" + backward_function_definitions + "\n" + diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 923c16447947..932ac0bdede8 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -547,6 +547,18 @@ __device__ inline DType bitwise_not(const DType a) { } } +template +__device__ inline typename type_util::mixed_type::type bitwise_or(const DType a, + const DType2 b) { + return static_cast(a) | static_cast(b); +} + +template +__device__ inline typename type_util::mixed_type::type bitwise_and(const DType a, + const DType2 b) { + return static_cast(a) & static_cast(b); +} + DEFINE_BINARY_MATH_FUNC(arctan2, ::atan2, ::atan2f) template diff --git a/src/common/cuda/rtc/special_functions-inl.h b/src/common/cuda/rtc/special_functions-inl.h index deede71ea06c..228930a44a11 100644 --- a/src/common/cuda/rtc/special_functions-inl.h +++ b/src/common/cuda/rtc/special_functions-inl.h @@ -28,15 +28,6 @@ namespace common { namespace cuda { namespace rtc { -const std::string float_limits() { - return std::string("constexpr double DBL_MAX = ") + - std::to_string(DBL_MAX) + - ";\n" + - "constexpr float FLT_MAX = " + - std::to_string(FLT_MAX) + - ";\n"; -} - // This code is based on the Cephes Library availible at http://www.netlib.org/cephes // The original author, Stephen Moshier, has kindly given permission to use this code // in mxnet. (See email below). 
@@ -59,30 +50,12 @@ const std::string float_limits() { // Direct inquiries to 30 Frost Street, Cambridge, MA 02140 // const char special_functions_definitions[] = R"code( +constexpr double DBL_INFINITY = 1e500; namespace op { namespace special_functions { -template -struct helper_numeric_limits { - __device__ inline static DType max(); -}; - -template<> -struct helper_numeric_limits { - __device__ inline static double max() { - return DBL_MAX; - } -}; - -template<> -struct helper_numeric_limits { - __device__ inline static double max() { - return FLT_MAX; - } -}; - template __device__ inline static DType trigamma(DType x); @@ -222,7 +195,7 @@ struct cephes { q = x; p = ::floor(q); if ( p == q ) { - return helper_numeric_limits::max(); + return DBL_INFINITY; } /* Remove the zeros of tan(PI x) * by subtracting the nearest integer from x diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu index 7dd7ba896b28..b1d7e71bf17d 100644 --- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cu +++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cu @@ -35,13 +35,13 @@ NNVM_REGISTER_OP(_npi_lcm) .set_attr("FCompute", BinaryBroadcastRTCCompute{"lcm"}); NNVM_REGISTER_OP(_npi_bitwise_and) -.set_attr("FCompute", BinaryBroadcastIntCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"bitwise_and"}); NNVM_REGISTER_OP(_npi_bitwise_xor) .set_attr("FCompute", BinaryBroadcastRTCCompute{"bitwise_xor"}); NNVM_REGISTER_OP(_npi_bitwise_or) -.set_attr("FCompute", BinaryBroadcastIntCompute); +.set_attr("FCompute", BinaryBroadcastRTCCompute{"bitwise_or"}); NNVM_REGISTER_OP(_backward_npi_copysign) .set_attr("FCompute", BinaryBroadcastRTCBackwardUseIn{"copysign_grad", @@ -86,13 +86,13 @@ NNVM_REGISTER_OP(_npi_lcm_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"lcm"}); NNVM_REGISTER_OP(_npi_bitwise_and_scalar) -.set_attr("FCompute", BinaryScalarOp::ComputeInt); +.set_attr("FCompute", BinaryScalarRTCCompute{"bitwise_and"}); NNVM_REGISTER_OP(_npi_bitwise_xor_scalar) .set_attr("FCompute", BinaryScalarRTCCompute{"bitwise_xor"}); NNVM_REGISTER_OP(_npi_bitwise_or_scalar) -.set_attr("FCompute", BinaryScalarOp::ComputeInt); +.set_attr("FCompute", BinaryScalarRTCCompute{"bitwise_or"}); NNVM_REGISTER_OP(_npi_ldexp) .set_attr("FCompute", BinaryBroadcastRTCCompute{"ldexp"}); From 762f56ca5930a745db78c808b183e8108214c567 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 15:04:13 -0700 Subject: [PATCH 27/64] Fix large tensor build --- src/operator/tensor/broadcast_reduce-inl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index bda0434f6005..7a0a09b7b2d6 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -469,7 +469,7 @@ inline int diff(const TShape& small, const TShape& big, } constexpr int nthread_reduce = 512; -constexpr int kBaseGridNum = 1024; +constexpr index_t kBaseGridNum = 1024; } // namespace @@ -524,8 +524,8 @@ struct ReduceImplConfig { if (M == 1) { kernel_1.blockDim.x = nthread_reduce; - kernel_1.gridDim.x = std::min((unsigned int)kBaseGridNum, - (N + kernel_1.blockDim.x - 1)/kernel_1.blockDim.x); + kernel_1.gridDim.x = std::min(kBaseGridNum, + static_cast((N + kernel_1.blockDim.x - 1)/kernel_1.blockDim.x)); } else { int reduce_strides[3]; reduce_strides[0] = fastest_stride(small, big, big); @@ -629,7 +629,7 @@ struct ReduceImplConfig { if 
(Mnext > 1) { kernel_2.blockSize = nthread_reduce; kernel_2.gridSize = std::min(kBaseGridNum, - (N + kernel_2.blockSize - 1)/kernel_2.blockSize); + static_cast((N + kernel_2.blockSize - 1)/kernel_2.blockSize)); } } } From 2ad556edcda523c05a8d4f4fdc9e730eaba2951a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 26 Jun 2020 16:21:24 -0700 Subject: [PATCH 28/64] Another try with DBL_MAX --- src/common/cuda/rtc/special_functions-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/cuda/rtc/special_functions-inl.h b/src/common/cuda/rtc/special_functions-inl.h index 228930a44a11..50f860405ef2 100644 --- a/src/common/cuda/rtc/special_functions-inl.h +++ b/src/common/cuda/rtc/special_functions-inl.h @@ -50,7 +50,7 @@ namespace rtc { // Direct inquiries to 30 Frost Street, Cambridge, MA 02140 // const char special_functions_definitions[] = R"code( -constexpr double DBL_INFINITY = 1e500; +constexpr double DBL_MAX = 1.7976931348623157081e+308; namespace op { @@ -195,7 +195,7 @@ struct cephes { q = x; p = ::floor(q); if ( p == q ) { - return DBL_INFINITY; + return DBL_MAX; } /* Remove the zeros of tan(PI x) * by subtracting the nearest integer from x From 21d468e693c9a71ab5361ab6df35ad70343698cd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Jun 2020 11:24:49 -0700 Subject: [PATCH 29/64] Fix Windows compilation --- .../numpy/linalg/np_matrix_rank-inl.h | 22 ++++---- src/operator/numpy/linalg/np_pinv-inl.h | 50 ++++++++----------- src/operator/numpy/np_broadcast_reduce_op.h | 23 ++++----- 3 files changed, 44 insertions(+), 51 deletions(-) diff --git a/src/operator/numpy/linalg/np_matrix_rank-inl.h b/src/operator/numpy/linalg/np_matrix_rank-inl.h index fa4b4c74752f..9dba245aff0b 100644 --- a/src/operator/numpy/linalg/np_matrix_rank-inl.h +++ b/src/operator/numpy/linalg/np_matrix_rank-inl.h @@ -359,6 +359,14 @@ void MatrixRankNoneTolForward(const nnvm::NodeAttrs& attrs, MatrixRankNoneTolForwardImpl(a, rank, attrs, ctx, req); } +// Windows has issues with #ifdefs inside MSHADOW_TYPE_SWITCH +#ifndef __CUDACC__ +#define NP_LINALG_MATRIX_RANK_BROADCAST(OP, RTCOP) \ + mxnet::op::BinaryBroadcastCompute +#else +#define NP_LINALG_MATRIX_RANK_BROADCAST(OP, RTCOP) mxnet::op::BinaryBroadcastRTCCompute {#RTCOP} +#endif + template void MatrixRankForwardImpl(const TBlob& a, const TBlob& tol, @@ -410,15 +418,9 @@ void MatrixRankForwardImpl(const TBlob& a, if (new_tol_data.dptr() != tol.dptr()) { Copy(new_tol_data.FlatTo1D(s), tol.FlatTo1D(s), s); } -#if !defined(__CUDACC__) - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {s_data, new_tol_data}, - {kWriteTo}, {broadcast_data}); -#else - mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, - {s_data, new_tol_data}, - {kWriteTo}, {broadcast_data}); -#endif // !defined(__CUDACC__) + NP_LINALG_MATRIX_RANK_BROADCAST(gt, greater)(attrs, ctx, + {s_data, new_tol_data}, + {kWriteTo}, {broadcast_data}); // Step5: Calculate rank. 
const int b_ndim = broadcast_shape.ndim(); const int data_size = broadcast_data.size(b_ndim - 1); @@ -431,6 +433,8 @@ void MatrixRankForwardImpl(const TBlob& a, }); } +#undef NP_LINALG_MATRIX_RANK_BROADCAST + template void MatrixRankForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/numpy/linalg/np_pinv-inl.h b/src/operator/numpy/linalg/np_pinv-inl.h index 0265e3683217..c6163617de68 100644 --- a/src/operator/numpy/linalg/np_pinv-inl.h +++ b/src/operator/numpy/linalg/np_pinv-inl.h @@ -464,6 +464,14 @@ inline mxnet::TShape GetTransAxis(const mxnet::TShape& in_shape) { return mxnet::TShape(trans_axis.begin(), trans_axis.end()); } +// Windows has issues with #ifdefs inside MSHADOW_TYPE_SWITCH +#ifndef __CUDACC__ +#define NP_LINALG_PINV_BROADCAST(OP, RTCOP) \ + mxnet::op::BinaryBroadcastCompute +#else +#define NP_LINALG_PINV_BROADCAST(OP, RTCOP) mxnet::op::BinaryBroadcastRTCCompute {#RTCOP} +#endif + template void PinvOpForwardImpl(const TBlob& a, const TBlob& rcond, @@ -553,23 +561,13 @@ void PinvOpForwardImpl(const TBlob& a, s, S.size(0), Smax.dptr_, S.dptr_, S.size(1), S.stride_); // Step3: Calculate Cutoff. std::vector temp_req({kWriteTo}); -#if !defined(__CUDACC__) - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {rcond_data, smax_data}, - temp_req, {cutoff_data}); - // Step4: Calculte Large. - mxnet::op::BinaryBroadcastCompute(attrs, ctx, - {s_data, cutoff_data}, - temp_req, {large_data}); -#else - mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, - {rcond_data, smax_data}, - temp_req, {cutoff_data}); + NP_LINALG_PINV_BROADCAST(mul, mul)(attrs, ctx, + {rcond_data, smax_data}, + temp_req, {cutoff_data}); // Step4: Calculte Large. - mxnet::op::BinaryBroadcastRTCCompute {"greater"}(attrs, ctx, - {s_data, cutoff_data}, - temp_req, {large_data}); -#endif // !defined(__CUDACC__) + NP_LINALG_PINV_BROADCAST(gt, greater)(attrs, ctx, + {s_data, cutoff_data}, + temp_req, {large_data}); // Step5: Discard small singular values. 
mxnet_op::Kernel::Launch( s, s_data.Size(), s_data.dptr(), large_data.dptr()); @@ -583,13 +581,8 @@ void PinvOpForwardImpl(const TBlob& a, } s_data = s_data.reshape(s_shape_newaxis); u_data = ut_data.reshape(ut_shape); -#if !defined(__CUDACC__) - mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, - temp_req, {u_data}); -#else - mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, - temp_req, {u_data}); -#endif // !defined(__CUDACC__) + NP_LINALG_PINV_BROADCAST(mul, mul)(attrs, ctx, {s_data, ut_data}, + temp_req, {u_data}); gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), pinv_a.FlatToKD(s), @@ -727,13 +720,8 @@ void PinvScalarRcondOpForwardImpl(const TBlob& a, } s_data = s_data.reshape(s_shape_newaxis); u_data = ut_data.reshape(ut_shape); -#if !defined(__CUDACC__) - mxnet::op::BinaryBroadcastCompute(attrs, ctx, {s_data, ut_data}, - {kWriteTo}, {u_data}); -#else - mxnet::op::BinaryBroadcastRTCCompute {"mul"}(attrs, ctx, {s_data, ut_data}, - {kWriteTo}, {u_data}); -#endif // !defined(__CUDACC__) + NP_LINALG_PINV_BROADCAST(mul, mul)(attrs, ctx, {s_data, ut_data}, + {kWriteTo}, {u_data}); gemm2::op(vt_data.FlatToKD(s), u_data.FlatToKD(s), pinv_a.FlatToKD(s), @@ -742,6 +730,8 @@ void PinvScalarRcondOpForwardImpl(const TBlob& a, }); } +#undef NP_LINALG_PINV_BROADCAST + template void PinvScalarRcondOpForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/numpy/np_broadcast_reduce_op.h b/src/operator/numpy/np_broadcast_reduce_op.h index 8fb70f7d7f45..3b505b788ae9 100644 --- a/src/operator/numpy/np_broadcast_reduce_op.h +++ b/src/operator/numpy/np_broadcast_reduce_op.h @@ -777,6 +777,13 @@ struct avg_grad_w_1D_kernel { } }; +// Windows has issues with #ifdefs inside MSHADOW_TYPE_SWITCH +#ifndef __CUDACC__ +#define NP_BROADCAST_REDUCE_OP_BROADCAST(OP) BinaryBroadcastCompute +#else +#define NP_BROADCAST_REDUCE_OP_BROADCAST(OP) BinaryBroadcastRTCCompute {#OP} +#endif + template void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -832,13 +839,8 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, // Compute weighted data TBlob wa = TBlob(temp_data_ptr, data.shape_, xpu::kDevMask); -#if !defined(__CUDACC__) - BinaryBroadcastCompute( - attrs, ctx, {data, weights}, {kWriteTo}, {wa}); -#else - BinaryBroadcastRTCCompute {"mul"}( + NP_BROADCAST_REDUCE_OP_BROADCAST(mul)( attrs, ctx, {data, weights}, {kWriteTo}, {wa}); -#endif // !defined(__CUDACC__) // Compute sum of weighted data TBlob sum_of_wa = TBlob(temp_sum_ptr, small1, xpu::kDevMask); @@ -855,13 +857,8 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, ctx, {weights}, {kWriteTo}, {scl}, workspace, w_src_shape, w_dst_shape); // Compute avg and assign output -#if !defined(__CUDACC__) - BinaryBroadcastCompute( - attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); -#else - BinaryBroadcastRTCCompute {"div"}( + NP_BROADCAST_REDUCE_OP_BROADCAST(div)( attrs, ctx, {sum_of_wa, scl}, req, {avg.reshape(small1)}); -#endif // !defined(__CUDACC__) } else { // Compute and assign the derivatives of a and weights const TBlob& igrad_a = outputs[0]; @@ -905,6 +902,8 @@ void NumpyWeightedAverageComputeImpl(const nnvm::NodeAttrs& attrs, }); } +#undef NP_BROADCAST_REDUCE_OP_BROADCAST + template void NumpyWeightedAverageForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, From eb3193539096569f3aefcd8f11522bbb2fc426df Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Jun 2020 11:33:10 -0700 Subject: [PATCH 
30/64] Fix the large int test --- src/operator/tensor/elemwise_binary_scalar_op.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index c5ddb5812b9f..38a8a47b5483 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -71,9 +71,7 @@ __global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, const auto input = IType::from(loader.separate()[i]); // enables returning different type const auto temp = OP(input, - static_cast::type> - (params.scalar)); + static_cast(params.scalar)); if (req == OpReqType::kAddTo) { // temp2 may have a wider type than either temp From fe5cbfb1b2c036e7a6936afb98f8f69af9d941fd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Jun 2020 11:44:35 -0700 Subject: [PATCH 31/64] Add the printing of error code value to CUDA_DRIVER_CALL --- src/common/cuda/utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/cuda/utils.h b/src/common/cuda/utils.h index 3e4bba1919ea..03f3b0f85d8c 100644 --- a/src/common/cuda/utils.h +++ b/src/common/cuda/utils.h @@ -168,11 +168,11 @@ inline __device__ bool __is_supported_cuda_architecture() { { \ CUresult e = (func); \ if (e != CUDA_SUCCESS) { \ - char const * err_msg = nullptr; \ + char const * err_msg = nullptr; \ if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) { \ LOG(FATAL) << "CUDA Driver: Unknown error " << e; \ } else { \ - LOG(FATAL) << "CUDA Driver: " << err_msg; \ + LOG(FATAL) << "CUDA Driver: " << e << " " << err_msg; \ } \ } \ } From 157751dece6dcc31cb7f67cb361b97582f07aef1 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Jun 2020 13:36:28 -0700 Subject: [PATCH 32/64] Fix --- src/operator/tensor/elemwise_binary_scalar_op.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index 38a8a47b5483..d416194c43c5 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -71,7 +71,9 @@ __global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, const auto input = IType::from(loader.separate()[i]); // enables returning different type const auto temp = OP(input, - static_cast(params.scalar)); + static_cast::type> + (params.scalar)); if (req == OpReqType::kAddTo) { // temp2 may have a wider type than either temp From c1c807126a15a2bc9fa35eee3b274756b5aa3356 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 29 Jun 2020 15:22:46 -0700 Subject: [PATCH 33/64] Fix binary scalar --- src/common/cuda/rtc/forward_functions-inl.h | 10 +++++++++- src/operator/tensor/elemwise_binary_scalar_op.cc | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 932ac0bdede8..0e829652accd 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -321,7 +321,15 @@ name (const DType a, const DType2 b) { \ } \ } -DEFINE_BINARY_MATH_FUNC(power, ::pow, ::powf) +template +__device__ inline typename type_util::mixed_type::type +power (const DType a, const DType2 b) { + if (type_util::has_double::value) { + return ::pow ((double)a, (double)b); \ + } else { + return ::powf ((float)a, (float)b); + } +} template __device__ inline typename 
type_util::mixed_type::type diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index d416194c43c5..e1c494818764 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -110,7 +110,7 @@ void BinaryScalarRTCCompute::operator()(const nnvm::NodeAttrs& attrs, OP + "\n" + binary_scalar_kernel_fwd; - const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + const int nvec = common::mshadow_type_info(outputs[0].type_flag_).size == 8 ? 2 : 4; const index_t size = outputs[0].Size(); binary_scalar_kernel_params params = { {inputs[0].dptr_, nullptr}, From 6d1a46d54747107a85e6aa16e465c8113c8651c9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 30 Jun 2020 10:42:00 -0700 Subject: [PATCH 34/64] Get more information when cuLaunchKernel fails --- src/common/cuda/rtc.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 3af945cb4288..a4c2b0646829 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -199,14 +199,22 @@ void launch(CUfunction function, std::vector *args) { CHECK(args->size() != 0) << "Empty argument list passed to a kernel."; - CUDA_DRIVER_CALL( - cuLaunchKernel(function, // function to launch + //CUDA_DRIVER_CALL( + CUresult err = cuLaunchKernel(function, // function to launch grid_dim.x, grid_dim.y, grid_dim.z, // grid dim block_dim.x, block_dim.y, block_dim.z, // block dim shared_mem_bytes, // shared memory mshadow::Stream::GetStream(stream), // stream const_cast(args->data()), // arguments - nullptr)); + nullptr);//); + if (err != CUDA_SUCCESS) { + LOG(FATAL) << "cuLaunchKernel failed: " + << (void*)function << " " + << "(" << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << ") " + << "(" << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << ") " + << shared_mem_bytes << " " + << args->size(); + } } } // namespace rtc From 98e542b76973df6d9b650a073bb9038775ee5616 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 30 Jun 2020 11:09:50 -0700 Subject: [PATCH 35/64] Going easy on Windows compiler --- src/common/cuda/rtc.cc | 1 + src/common/cuda/rtc/forward_functions-inl.h | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index a4c2b0646829..77449d51a6e5 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -118,6 +118,7 @@ CUfunction get_function(const std::string &code, type_support_string + "\n" + util_string + "\n" + special_functions_definitions + '\n' + + function_definitions_util + "\n" + function_definitions + "\n" + backward_function_definitions + "\n" + vectorization_support_string + "\n" + diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 0e829652accd..174228d1048f 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -27,7 +27,7 @@ namespace common { namespace cuda { namespace rtc { -const char function_definitions[] = R"code( +const char function_definitions_util[] = R"code( #define INT_MAX (2147483647) @@ -264,6 +264,12 @@ __device__ inline void store_add_index(const VectorType value, int } } +} // namespace op +)code"; + +const char function_definitions[] = R"code( +namespace op { + template __device__ inline DType identity(const DType val) { return val; From 1195a027cfad857030d4fdbc7ee660ebe7346066 Mon Sep 17 
00:00:00 2001 From: Przemek Tredak Date: Tue, 30 Jun 2020 11:46:45 -0700 Subject: [PATCH 36/64] Fix lint --- src/common/cuda/rtc.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 77449d51a6e5..384ca9db1d4a 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -200,22 +200,22 @@ void launch(CUfunction function, std::vector *args) { CHECK(args->size() != 0) << "Empty argument list passed to a kernel."; - //CUDA_DRIVER_CALL( - CUresult err = cuLaunchKernel(function, // function to launch - grid_dim.x, grid_dim.y, grid_dim.z, // grid dim - block_dim.x, block_dim.y, block_dim.z, // block dim - shared_mem_bytes, // shared memory - mshadow::Stream::GetStream(stream), // stream - const_cast(args->data()), // arguments - nullptr);//); - if (err != CUDA_SUCCESS) { - LOG(FATAL) << "cuLaunchKernel failed: " - << (void*)function << " " - << "(" << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << ") " - << "(" << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << ") " - << shared_mem_bytes << " " - << args->size(); - } + // CUDA_DRIVER_CALL( + CUresult err = cuLaunchKernel(function, // function to launch + grid_dim.x, grid_dim.y, grid_dim.z, // grid dim + block_dim.x, block_dim.y, block_dim.z, // block dim + shared_mem_bytes, // shared memory + mshadow::Stream::GetStream(stream), // stream + const_cast(args->data()), // arguments + nullptr); // ); + if (err != CUDA_SUCCESS) { + LOG(FATAL) << "cuLaunchKernel failed: " + << reinterpret_cast(function) << " " + << "(" << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << ") " + << "(" << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << ") " + << shared_mem_bytes << " " + << args->size(); + } } } // namespace rtc From d597d83f08f576c1b15a6f806b574325bf630f2e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 30 Jun 2020 13:43:55 -0700 Subject: [PATCH 37/64] Reorganization to split strings due to Windows compilation problems --- src/common/cuda/rtc.cc | 3 +- src/common/cuda/rtc/forward_functions-inl.h | 92 +++++++++++---------- 2 files changed, 51 insertions(+), 44 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 384ca9db1d4a..07cb85cfe246 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -119,7 +119,8 @@ CUfunction get_function(const std::string &code, util_string + "\n" + special_functions_definitions + '\n' + function_definitions_util + "\n" + - function_definitions + "\n" + + function_definitions_binary + "\n" + + function_definitions_unary + "\n" + backward_function_definitions + "\n" + vectorization_support_string + "\n" + reducer + "\n"; diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 174228d1048f..7dfafc86de9b 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -267,19 +267,9 @@ __device__ inline void store_add_index(const VectorType value, int } // namespace op )code"; -const char function_definitions[] = R"code( +const char function_definitions_binary[] = R"code( namespace op { -template -__device__ inline DType identity(const DType val) { - return val; -} - -template -__device__ inline DType negation(const DType val) { - return -val; -} - template __device__ inline typename type_util::mixed_type::type add(const DType a, const DType2 b) { @@ -552,15 +542,6 @@ __device__ inline typename type_util::mixed_type::type 
bitwise_xo return static_cast(a) ^ static_cast(b); } -template -__device__ inline DType bitwise_not(const DType a) { - if (type_util::is_same::value) { - return !a; - } else { - return ~static_cast(a); - } -} - template __device__ inline typename type_util::mixed_type::type bitwise_or(const DType a, const DType2 b) { @@ -599,6 +580,47 @@ rldexp(const DType a, const DType2 b) { #undef DEFINE_BINARY_MATH_FUNC +template +__device__ inline bool np_logical_and(const DType val, const DType2 val2) { + return (val && val2) ? true : false; +} + +template +__device__ inline bool np_logical_or(const DType val, const DType2 val2) { + return (val || val2) ? true : false; +} + +template +__device__ inline bool np_logical_xor(const DType val, const DType2 val2) { + return ((val || val2) && !(val && val2)) ? true : false; +} + +template +__device__ inline DType left(const DType left_val, const DType2 right_val) { + return left_val; +} + +template +__device__ inline DType2 right(const DType left_val, const DType2 right_val) { + return right_val; +} + +} // namespace op +)code"; + +const char function_definitions_unary[] = R"code( +namespace op { + +template +__device__ inline DType identity(const DType val) { + return val; +} + +template +__device__ inline DType negation(const DType val) { + return -val; +} + template __device__ inline typename LoadType::Type cast(const DType val) { return static_cast::Type>(val); @@ -856,21 +878,6 @@ __device__ inline bool np_logical_not(const DType val) { return !static_cast(val); } -template -__device__ inline bool np_logical_and(const DType val, const DType2 val2) { - return (val && val2) ? true : false; -} - -template -__device__ inline bool np_logical_or(const DType val, const DType2 val2) { - return (val || val2) ? true : false; -} - -template -__device__ inline bool np_logical_xor(const DType val, const DType2 val2) { - return ((val || val2) && !(val && val2)) ? 
true : false; -} - template __device__ inline bool isnan(const DType val) { return util::isnan(val); @@ -898,14 +905,13 @@ __device__ inline bool isfinite(const DType val) { #undef DEFINE_UNARY_MATH_FUNC -template -__device__ inline DType left(const DType left_val, const DType2 right_val) { - return left_val; -} - -template -__device__ inline DType2 right(const DType left_val, const DType2 right_val) { - return right_val; +template +__device__ inline DType bitwise_not(const DType a) { + if (type_util::is_same::value) { + return !a; + } else { + return ~static_cast(a); + } } } // namespace op From 67d115c044320867d80d9d66c3cfad7ee7178d86 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 2 Jul 2020 08:59:52 -0700 Subject: [PATCH 38/64] Fix error with uninitialized value --- src/operator/tensor/broadcast_reduce-inl.h | 2 ++ src/operator/tensor/reduce_rtc.cc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 7a0a09b7b2d6..3b7fec7d60fc 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -521,6 +521,8 @@ struct ReduceImplConfig { } workspace_size = 0; + kernel_1.shMemSize = 0; + kernel_1.do_transpose = false; if (M == 1) { kernel_1.blockDim.x = nthread_reduce; diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index 84a60931ae65..b14a636b0469 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -396,7 +396,7 @@ void RTCReduceM1Impl(Stream *s, const TBlob &small, const TBlob &big, "reduce_kernel_M1", dev_id); launch(reduce_kernel_M1_func, config.kernel_1.gridDim, config.kernel_1.blockDim, - config.kernel_1.shMemSize, s, &args); + 0, s, &args); } } // namespace From edf3a8ab1171fb671555553c0fe1f35aa9549823 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 2 Jul 2020 14:07:01 -0700 Subject: [PATCH 39/64] Fix handling of different types for backward of binary scalar --- src/operator/tensor/elemwise_binary_scalar_op.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index e1c494818764..2ba249acfe87 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -196,7 +196,10 @@ __global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params param const auto input = IType::from(input_loader.separate()[i]); // enables returning different type const auto temp = op::mul(ograd, - OP(input, static_cast(params.scalar))); + OP(input, + static_cast + ::type>(params.scalar)); if (req == OpReqType::kAddTo) { // temp2 may have a wider type than either temp From 71deaff60c04dbab8905c75b94269b945a1f17f7 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 2 Jul 2020 15:39:10 -0700 Subject: [PATCH 40/64] Decreasing RTC overhead --- src/common/cuda/rtc.cc | 11 ++-- src/common/cuda/rtc.h | 3 +- src/common/cuda/rtc/vectorization-inl.h | 59 +++++++++---------- src/operator/fusion/fused_op.cu | 2 +- .../tensor/elemwise_binary_broadcast_op.cc | 22 +++---- src/operator/tensor/elemwise_binary_op.cc | 28 ++++----- .../tensor/elemwise_binary_scalar_op.cc | 14 ++--- src/operator/tensor/elemwise_sum.cu | 12 ++-- src/operator/tensor/elemwise_unary_op.cc | 6 +- src/operator/tensor/reduce_rtc.cc | 18 ++++-- 10 files changed, 90 insertions(+), 85 deletions(-) diff --git a/src/common/cuda/rtc.cc 
b/src/common/cuda/rtc.cc index 07cb85cfe246..d0b84495f759 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -93,8 +93,9 @@ std::string GetPtx(nvrtcProgram program) { } // namespace -CUfunction get_function(const std::string &code, +CUfunction get_function(const std::string ¶meters, const std::string &kernel_name, + const std::string &code, int dev_id) { constexpr int CACHESIZE_WARN_THRESHOLD = 10000; std::lock_guard l(lock); @@ -104,13 +105,13 @@ CUfunction get_function(const std::string &code, std::string ptx; std::vector functions; }; - // Maps from the cuda source code (minus header) to the ptx and jit-compiled CUfunctions. + // Maps from the kernel name and parameters to the ptx and jit-compiled CUfunctions. using KernelCache = std::unordered_map; // Per-gpu-architecture compiled kernel cache with jit-compiled function for each device context static std::unordered_map compiled_kernels; int sm_arch = SMArch(dev_id); - KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; // make null map as needed - KernelInfo& kinfo = compiled_kernels_this_arch[code]; // make KernelInfo as needed + KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; // make null map as needed + KernelInfo& kinfo = compiled_kernels_this_arch[parameters + kernel_name]; // make KernelInfo as needed if (kinfo.ptx.size() == 0) { // It's the first time we've seen this kernel, so we need to generate the ptx and mangled_name. static std::string common_header = @@ -124,7 +125,7 @@ CUfunction get_function(const std::string &code, backward_function_definitions + "\n" + vectorization_support_string + "\n" + reducer + "\n"; - std::string code_with_header = common_header + code; + std::string code_with_header = common_header + parameters + code; // If verbose mode, output kernel source, though not including the common header if (dmlc::GetEnv("MXNET_RTC_VERBOSE", false)) { LOG(INFO) << "\n" << std::string(80, '-') << "\n" << code; diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h index b458691b63a7..ea95346a0468 100644 --- a/src/common/cuda/rtc.h +++ b/src/common/cuda/rtc.h @@ -52,8 +52,9 @@ std::string to_string(OpReqType req); extern std::mutex lock; -CUfunction get_function(const std::string &code, +CUfunction get_function(const std::string ¶meters, const std::string &kernel_name, + const std::string &code, int dev_id); void launch(CUfunction function, diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index f90b1565cdef..40fa84422299 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -315,8 +315,9 @@ constexpr int vectorized_kernel_thread_num = 512; } // namespace template -void VectorizedKernelRTCLauncher(const std::string &code, +void VectorizedKernelRTCLauncher(const std::string ¶meters, const std::string &kernel_name, + const std::string &code, int nvec, const index_t lead_dim, const index_t other_dim, @@ -331,18 +332,18 @@ void VectorizedKernelRTCLauncher(const std::string &code, if (N != 0) { auto align = CheckAlignment(params, lead_dim, other_dim, nvec, inputs, outputs); - std::stringstream kernel_builder; + std::string kernel_builder; + kernel_builder.reserve(2560); // Fill input types int counter = 0; for (const auto& input : inputs) { const auto& type_info = common::mshadow_type_info(input.type_flag_); - kernel_builder << "using InputType" - << counter - << " = " - << type_info.name - << ";" - << std::endl; + kernel_builder += "using InputType"; + kernel_builder += 
std::to_string(counter); + kernel_builder += " = "; + kernel_builder += type_info.name; + kernel_builder += ";\n"; ++counter; } @@ -350,44 +351,37 @@ void VectorizedKernelRTCLauncher(const std::string &code, counter = 0; for (const auto& output : outputs) { const auto& type_info = common::mshadow_type_info(output.type_flag_); - kernel_builder << "using OutputType" - << counter - << " = " - << type_info.name - << ";" - << std::endl; + kernel_builder += "using OutputType"; + kernel_builder += std::to_string(counter); + kernel_builder += " = "; + kernel_builder += type_info.name; + kernel_builder += ";\n"; ++counter; } switch (align) { case Alignment::SAME_ALIGNED: - kernel_builder << "const bool aligned = true;" - << std::endl - << "const int nvec = " - << nvec - << ";" - << std::endl; + kernel_builder += "const bool aligned = true;\n" + "const int nvec = "; + kernel_builder += std::to_string(nvec); + kernel_builder += ";\n"; break; case Alignment::SAME_UNALIGNED: - kernel_builder << "const bool aligned = false;" - << std::endl - << "const int nvec = " - << nvec - << ";" - << std::endl; + kernel_builder += "const bool aligned = false;\n" + "const int nvec = "; + kernel_builder += std::to_string(nvec); + kernel_builder += ";\n"; break; case Alignment::DIFFERENT: { // If the pointers are aligned differently we cannot vectorize - kernel_builder << "const bool aligned = true;" - << std::endl - << "const int nvec = 1;" - << std::endl; + kernel_builder += "const bool aligned = true;\n" + "const int nvec = 1;\n"; nvec = 1; break; } } - kernel_builder << code; + kernel_builder += parameters; index_t num_aligned_elements = get_num_aligned_elements( params.inputs[lead_input_num], @@ -401,8 +395,9 @@ void VectorizedKernelRTCLauncher(const std::string &code, max_blocks); std::vector args = {¶ms, &lead_dim, &other_dim, &N, &num_aligned_elements}; - auto function = common::cuda::rtc::get_function(kernel_builder.str(), + auto function = common::cuda::rtc::get_function(kernel_builder, kernel_name, + code, dev_id); common::cuda::rtc::launch(function, diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 6b892ce68349..9b6b7d8321d9 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -567,7 +567,7 @@ std::string FusedOp::GenerateCode(const std::vector &req, CUfunction FusedOp::CompileCode(const std::string &code, const std::string &kernel_name, int dev_id) { - return common::cuda::rtc::get_function(code, "FusedKernel_" + kernel_name, dev_id); + return common::cuda::rtc::get_function(code, "FusedKernel_" + kernel_name, "", dev_id); } diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index 136005bd7700..4a84ae5f8f40 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -319,10 +319,13 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, "const int ndim = " + std::to_string(ndim) + ";\n"; - std::string kernel_name; if (common_shape != 1) { - code += broadcast_kernel_fwd; - kernel_name = "binary_broadcast_kernel"; + VectorizedKernelRTCLauncher(code, "binary_broadcast_kernel", + broadcast_kernel_fwd, nvec, + lead_dim, other_dim, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id, + lead_input_num); } else { if (params.stride[0][first_different] == 0) { lead_input_num = 1; @@ -334,15 +337,14 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, "using DType = 
InputType0;\n" "using DType2 = InputType1;\n"; } - code += single_side_broadcast_kernel_fwd; - kernel_name = "single_side_binary_broadcast_kernel"; + VectorizedKernelRTCLauncher(code, "single_side_binary_broadcast_kernel", + single_side_broadcast_kernel_fwd, nvec, + lead_dim, other_dim, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id, + lead_input_num); } - VectorizedKernelRTCLauncher(code, kernel_name, nvec, - lead_dim, other_dim, s, params, - inputs, outputs, - ctx.run_ctx.get_ctx().dev_id, - lead_input_num); } } diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index 86b8b5ad9f1f..12bed7dd09a7 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -164,20 +164,20 @@ void ElemwiseBinaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); - const std::string code = std::string("const OpReqType req = ") + - util::to_string(req[0]) + - ";\n" - "#define OP op::" + - OP + - "\n" + - binary_kernel_fwd; + std::string code = "const OpReqType req = "; + code += util::to_string(req[0]); + code += ";\n" + "#define OP op::"; + code += OP; + code += "\n"; const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; const index_t size = outputs[0].Size(); binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_, nullptr}, {outputs[0].dptr_, nullptr} }; - VectorizedKernelRTCLauncher(code, "binary_kernel", nvec, + VectorizedKernelRTCLauncher(code, "binary_kernel", + binary_kernel_fwd, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); @@ -291,15 +291,15 @@ void ElemwiseBinaryRTCBwdUseNone::operator()(const nnvm::NodeAttrs& attrs, ";\n" "const bool write_right_output = " + std::to_string(write_right_output) + - ";\n" + - binary_kernel_bwd_use_none; + ";\n"; const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; const index_t size = outputs[0].Size(); binary_kernel_params params = { {inputs[0].dptr_, nullptr, nullptr}, {outputs[0].dptr_, outputs[1].dptr_} }; - VectorizedKernelRTCLauncher(code, "binary_kernel_bwd", nvec, + VectorizedKernelRTCLauncher(code, "binary_kernel_bwd", + binary_kernel_bwd_use_none, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); @@ -409,8 +409,7 @@ void ElemwiseBinaryRTCBwdUseIn::operator()(const nnvm::NodeAttrs& attrs, "\n" "#define LOP op::" + LOP + - "\n" + - binary_kernel_bwd_use_in; + "\n"; // Using 64 bit loads to reduce register pressure size_t output_type_size = common::mshadow_type_info(outputs[0].type_flag_).size; const int nvec = output_type_size <= sizeof(uint64_t) @@ -421,7 +420,8 @@ void ElemwiseBinaryRTCBwdUseIn::operator()(const nnvm::NodeAttrs& attrs, binary_kernel_params params = { {inputs[0].dptr_, inputs[1].dptr_, inputs[2].dptr_}, {outputs[0].dptr_, outputs[1].dptr_} }; - VectorizedKernelRTCLauncher(code, "binary_kernel_bwd", nvec, + VectorizedKernelRTCLauncher(code, "binary_kernel_bwd", + binary_kernel_bwd_use_in, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index 2ba249acfe87..bb565e5857b5 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -108,8 +108,7 @@ void BinaryScalarRTCCompute::operator()(const nnvm::NodeAttrs& attrs, ";\n" + "#define OP op::" + OP + - "\n" + - binary_scalar_kernel_fwd; + "\n"; const int nvec = common::mshadow_type_info(outputs[0].type_flag_).size == 8 ? 2 : 4; const index_t size = outputs[0].Size(); @@ -117,7 +116,8 @@ void BinaryScalarRTCCompute::operator()(const nnvm::NodeAttrs& attrs, {outputs[0].dptr_}, alpha }; - VectorizedKernelRTCLauncher(code, "binary_scalar_kernel", nvec, + VectorizedKernelRTCLauncher(code, "binary_scalar_kernel", + binary_scalar_kernel_fwd, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); @@ -199,7 +199,7 @@ __global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params param OP(input, static_cast - ::type>(params.scalar)); + ::type>(params.scalar))); if (req == OpReqType::kAddTo) { // temp2 may have a wider type than either temp @@ -234,8 +234,7 @@ void BinaryScalarRTCBackward::operator()(const nnvm::NodeAttrs& attrs, ";\n" "#define OP op::" + OP + - "\n" + - binary_scalar_kernel_bwd; + "\n"; const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 
2 : 4; const index_t size = outputs[0].Size(); @@ -243,7 +242,8 @@ void BinaryScalarRTCBackward::operator()(const nnvm::NodeAttrs& attrs, {outputs[0].dptr_}, alpha }; - VectorizedKernelRTCLauncher(code, "binary_scalar_kernel_bwd", nvec, + VectorizedKernelRTCLauncher(code, "binary_scalar_kernel_bwd", + binary_scalar_kernel_bwd, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index cc695c826d42..aaf0e8f4dbb1 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -120,15 +120,15 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, if (i == 0) { const std::string code = std::string("const OpReqType req = ") + util::to_string(req[0]) + - ";\n" + - elementwise_sum_kernel; + ";\n"; elementwise_sum_params params{}; params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); for (int j = 0; j < params.num_inputs; ++j) { params.inputs[j] = inputs[i + j].dptr_; } params.outputs[0] = outputs[0].dptr_; - VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel", nvec, + VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel", + elementwise_sum_kernel, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); @@ -136,8 +136,7 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, /* During subsequent launches we need to accumulate into the previous outputs */ - const std::string code = std::string("const OpReqType req = OpReqType::kAddTo;\n") + - elementwise_sum_kernel; + const std::string code = "const OpReqType req = OpReqType::kAddTo;\n"; elementwise_sum_params params{}; params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); for (int j = 0; j < params.num_inputs; ++j) { @@ -145,7 +144,8 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, } params.outputs[0] = outputs[0].dptr_; const std::vector new_inputs(inputs.begin() + i, inputs.end()); - VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel", nvec, + VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel", + elementwise_sum_kernel, nvec, size, 1, s, params, new_inputs, outputs, ctx.run_ctx.get_ctx().dev_id); diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index a84d97e9ac89..0f707704ca18 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -102,15 +102,15 @@ void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, ";\n" "#define OP op::" + OP + - "\n" + - unary_kernel_fwd; + "\n"; const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; const index_t size = outputs[0].Size(); unary_kernel_params params = { {inputs[0].dptr_}, {outputs[0].dptr_} }; - VectorizedKernelRTCLauncher(code, "unary_kernel", nvec, + VectorizedKernelRTCLauncher(code, "unary_kernel", + unary_kernel_fwd, nvec, size, 1, s, params, inputs, outputs, ctx.run_ctx.get_ctx().dev_id); diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index b14a636b0469..d7544ffdd943 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -292,8 +292,10 @@ void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, const auto &function_code = (lhs == nullptr) ? 
reduce_function_code : reduce_function_use_input_code; - auto reduce_kernel_func = get_function(code + function_code + reduce_kernel_code, - "reduce_kernel", dev_id); + auto reduce_kernel_func = get_function(code + function_code, + "reduce_kernel", + reduce_kernel_code, + dev_id); launch(reduce_kernel_func, config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, s, &args); @@ -306,8 +308,10 @@ void RTCReduceImpl(Stream *s, const TBlob& small, const bool addto, args.emplace_back(&small_dptr); args.emplace_back(&small.dptr_); - auto reduce_lines_kernel_func = get_function(code + reduce_lines_kernel_code, - "reduce_lines_kernel", dev_id); + auto reduce_lines_kernel_func = get_function(code, + "reduce_lines_kernel", + reduce_lines_kernel_code, + dev_id); launch(reduce_lines_kernel_func, config.kernel_2.gridSize, config.kernel_2.blockSize, 0, s, &args); } @@ -392,8 +396,10 @@ void RTCReduceM1Impl(Stream *s, const TBlob &small, const TBlob &big, args.emplace_back(&small.dptr_); args.emplace_back(¶m); - auto reduce_kernel_M1_func = get_function(code + reduce_kernel_M1_code, - "reduce_kernel_M1", dev_id); + auto reduce_kernel_M1_func = get_function(code, + "reduce_kernel_M1", + reduce_kernel_M1_code, + dev_id); launch(reduce_kernel_M1_func, config.kernel_1.gridDim, config.kernel_1.blockDim, 0, s, &args); From ff545f3b2c3f08f03c3051de9d3c6a1672ade34f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 6 Jul 2020 15:00:28 -0700 Subject: [PATCH 41/64] Fix lint and remove rest of mentions of ENABLE_RTC --- ci/build_windows.py | 6 ------ ci/docker/runtime_functions.sh | 1 - config/darwin.cmake | 1 - config/linux.cmake | 1 - config/linux_gpu.cmake | 1 - make/config.mk | 3 --- make/config_jetson.mk | 3 --- make/osx.mk | 3 --- make/staticbuild/darwin_cpu.mk | 3 --- make/staticbuild/darwin_mkl.mk | 3 --- make/staticbuild/linux_cpu.mk | 3 --- make/staticbuild/linux_cu100.mk | 3 --- make/staticbuild/linux_cu101.mk | 3 --- make/staticbuild/linux_cu102.mk | 3 --- make/staticbuild/linux_cu92.mk | 3 --- make/staticbuild/linux_native.mk | 3 --- python/mxnet/runtime.py | 2 +- src/common/cuda/rtc.cc | 6 ++++-- src/operator/tensor/elemwise_binary_broadcast_op.cc | 1 - 19 files changed, 5 insertions(+), 47 deletions(-) diff --git a/ci/build_windows.py b/ci/build_windows.py index c8d3af515b5a..0a195b50f77a 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -61,7 +61,6 @@ class BuildFlavour(Enum): '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' @@ -76,7 +75,6 @@ class BuildFlavour(Enum): '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' @@ -91,7 +89,6 @@ class BuildFlavour(Enum): '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=mkl ' @@ -106,7 +103,6 @@ class BuildFlavour(Enum): '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=OFF ' '-DUSE_CUDNN=OFF ' - '-DENABLE_CUDA_RTC=OFF ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=mkl ' @@ -121,7 +117,6 @@ class BuildFlavour(Enum): '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=ON ' '-DUSE_CUDNN=ON ' - '-DENABLE_CUDA_RTC=ON ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' @@ -136,7 +131,6 @@ class BuildFlavour(Enum): '-DCMAKE_CXX_COMPILER=cl ' '-DUSE_CUDA=ON ' '-DUSE_CUDNN=ON ' - '-DENABLE_CUDA_RTC=ON ' '-DUSE_OPENCV=ON ' '-DUSE_OPENMP=ON ' '-DUSE_BLAS=open ' 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 38404cd669f4..307f481f81b1 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -194,7 +194,6 @@ build_jetson() { -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \ -DUSE_CUDA=ON \ -DMXNET_CUDA_ARCH="5.2" \ - -DENABLE_CUDA_RTC=OFF \ -DUSE_OPENCV=OFF \ -DUSE_OPENMP=ON \ -DUSE_LAPACK=OFF \ diff --git a/config/darwin.cmake b/config/darwin.cmake index a65509f0ba1c..59f031e49f01 100644 --- a/config/darwin.cmake +++ b/config/darwin.cmake @@ -126,5 +126,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.") -set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support") set(USE_NVTX ON CACHE BOOL "Build with NVTX support") diff --git a/config/linux.cmake b/config/linux.cmake index 84eecc2e9701..ff338231e277 100644 --- a/config/linux.cmake +++ b/config/linux.cmake @@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.") -set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support") set(USE_NVTX ON CACHE BOOL "Build with NVTX support") diff --git a/config/linux_gpu.cmake b/config/linux_gpu.cmake index 0dad43332978..442ac6cb3578 100644 --- a/config/linux_gpu.cmake +++ b/config/linux_gpu.cmake @@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num # Other GPU features set(USE_NCCL "Use NVidia NCCL with CUDA" OFF) set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.") -set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support") set(USE_NVTX ON CACHE BOOL "Build with NVTX support") diff --git a/make/config.mk b/make/config.mk index 3d8e974a31c1..1e8ae9be5ba4 100644 --- a/make/config.mk +++ b/make/config.mk @@ -77,9 +77,6 @@ USE_CUDA = 0 # USE_CUDA_PATH = /usr/local/cuda USE_CUDA_PATH = NONE -# whether to enable CUDA runtime compilation -ENABLE_CUDA_RTC = 1 - # whether use CuDNN R3 library USE_CUDNN = 0 diff --git a/make/config_jetson.mk b/make/config_jetson.mk index 7de6eff7b6b5..cc7302767198 100644 --- a/make/config_jetson.mk +++ b/make/config_jetson.mk @@ -77,9 +77,6 @@ USE_CUDA_PATH = /usr/local/cuda # CUDA_ARCH setting CUDA_ARCH = -gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_72,code=sm_72 -# whether to enable CUDA runtime compilation -ENABLE_CUDA_RTC = 0 - # whether use CuDNN R3 library USE_CUDNN = 1 diff --git a/make/osx.mk b/make/osx.mk index 25f3ba6df55b..55a44e9b698c 100644 --- a/make/osx.mk +++ b/make/osx.mk @@ -68,9 +68,6 @@ USE_CUDA = 0 # USE_CUDA_PATH = /usr/local/cuda USE_CUDA_PATH = NONE -# whether to enable CUDA runtime compilation -ENABLE_CUDA_RTC = 1 - # whether use CUDNN R3 library USE_CUDNN = 0 diff --git a/make/staticbuild/darwin_cpu.mk b/make/staticbuild/darwin_cpu.mk index 1859936f180e..eb15d6498c1e 100644 --- a/make/staticbuild/darwin_cpu.mk +++ b/make/staticbuild/darwin_cpu.mk @@ -75,9 +75,6 @@ USE_CUDNN = 0 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) -ENABLE_CUDA_RTC = 0 - # use openmp for parallelization USE_OPENMP = 0 USE_OPERATOR_TUNING = 1 diff --git a/make/staticbuild/darwin_mkl.mk b/make/staticbuild/darwin_mkl.mk index 1859936f180e..eb15d6498c1e 100644 --- a/make/staticbuild/darwin_mkl.mk +++ b/make/staticbuild/darwin_mkl.mk @@ -75,9 +75,6 @@ USE_CUDNN = 0 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 0 - # use openmp for parallelization USE_OPENMP = 0 USE_OPERATOR_TUNING = 1 diff --git a/make/staticbuild/linux_cpu.mk b/make/staticbuild/linux_cpu.mk index 1cf389ae4a57..b7c0e1b2432e 100644 --- a/make/staticbuild/linux_cpu.mk +++ b/make/staticbuild/linux_cpu.mk @@ -75,9 +75,6 @@ USE_CUDNN = 0 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 0 - # use openmp for parallelization USE_OPENMP = 1 USE_OPERATOR_TUNING = 1 diff --git a/make/staticbuild/linux_cu100.mk b/make/staticbuild/linux_cu100.mk index 855485c5b6df..7a633521da7a 100644 --- a/make/staticbuild/linux_cu100.mk +++ b/make/staticbuild/linux_cu100.mk @@ -86,9 +86,6 @@ USE_NCCL = 1 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - USE_NVTX=1 # use openmp for parallelization diff --git a/make/staticbuild/linux_cu101.mk b/make/staticbuild/linux_cu101.mk index 7bbde85bee11..cd24efcb01dc 100644 --- a/make/staticbuild/linux_cu101.mk +++ b/make/staticbuild/linux_cu101.mk @@ -86,9 +86,6 @@ USE_NCCL = 1 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - USE_NVTX=1 # use openmp for parallelization diff --git a/make/staticbuild/linux_cu102.mk b/make/staticbuild/linux_cu102.mk index 963842a19cff..b9ee1bdbb988 100644 --- a/make/staticbuild/linux_cu102.mk +++ b/make/staticbuild/linux_cu102.mk @@ -86,9 +86,6 @@ USE_NCCL = 1 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - USE_NVTX=1 # use openmp for parallelization diff --git a/make/staticbuild/linux_cu92.mk b/make/staticbuild/linux_cu92.mk index 2cbbdd25eeaf..a3ee812a0ab2 100644 --- a/make/staticbuild/linux_cu92.mk +++ b/make/staticbuild/linux_cu92.mk @@ -86,9 +86,6 @@ USE_NCCL = 1 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - USE_NVTX=1 # use openmp for parallelization diff --git a/make/staticbuild/linux_native.mk b/make/staticbuild/linux_native.mk index 348a659cd9e2..c80142adc3d1 100644 --- a/make/staticbuild/linux_native.mk +++ b/make/staticbuild/linux_native.mk @@ -75,9 +75,6 @@ USE_CUDNN = 0 # For CUDA < 6.0, comment the *_50 lines for compatibility. # CUDA_ARCH := -# whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) -ENABLE_CUDA_RTC = 0 - # use openmp for parallelization USE_OPENMP = 1 USE_OPERATOR_TUNING = 1 diff --git a/python/mxnet/runtime.py b/python/mxnet/runtime.py index 27500e7eb772..c80742e93e3a 100644 --- a/python/mxnet/runtime.py +++ b/python/mxnet/runtime.py @@ -37,7 +37,7 @@ True print(features) - [✖ CUDA, ✖ CUDNN, ✖ NCCL, ✖ CUDA_RTC, ✖ TENSORRT, ✔ CPU_SSE, ✔ CPU_SSE2, ✔ CPU_SSE3, + [✖ CUDA, ✖ CUDNN, ✖ NCCL, ✖ TENSORRT, ✔ CPU_SSE, ✔ CPU_SSE2, ✔ CPU_SSE3, ✔ CPU_SSE4_1, ✔ CPU_SSE4_2, ✖ CPU_SSE4A, ✔ CPU_AVX, ✖ CPU_AVX2, ✔ OPENMP, ✖ SSE, ✔ F16C, ✔ JEMALLOC, ✔ BLAS_OPEN, ✖ BLAS_ATLAS, ✖ BLAS_MKL, ✖ BLAS_APPLE, ✔ LAPACK, ✖ MKLDNN, ✔ OPENCV, ✖ CAFFE, ✖ DIST_KVSTORE, ✖ CXX14, ✖ INT64_TENSOR_SIZE, diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index d0b84495f759..ee1c7b594067 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -110,8 +110,10 @@ CUfunction get_function(const std::string ¶meters, // Per-gpu-architecture compiled kernel cache with jit-compiled function for each device context static std::unordered_map compiled_kernels; int sm_arch = SMArch(dev_id); - KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; // make null map as needed - KernelInfo& kinfo = compiled_kernels_this_arch[parameters + kernel_name]; // make KernelInfo as needed + // make null map as needed + KernelCache& compiled_kernels_this_arch = compiled_kernels[sm_arch]; + // make KernelInfo as needed + KernelInfo& kinfo = compiled_kernels_this_arch[parameters + kernel_name]; if (kinfo.ptx.size() == 0) { // It's the first time we've seen this kernel, so we need to generate the ptx and mangled_name. static std::string common_header = diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index 4a84ae5f8f40..b89ad92a051a 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -344,7 +344,6 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, ctx.run_ctx.get_ctx().dev_id, lead_input_num); } - } } From 503046ba5fd3c1567ba87bff5c9c454eac36f9c9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 6 Jul 2020 15:30:20 -0700 Subject: [PATCH 42/64] Jetson with RTC --- ci/docker/Dockerfile.build.jetson | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index 93fe5e0a5b0d..92c1df75ed07 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -42,7 +42,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ && rm -rf /var/lib/apt/lists/* # cmake on Ubuntu 18.04 is too old -RUN python3 -m pip install cmake +# cmake 3.17.3 does not correctly set the host compiler for NVCC, see https://gitlab.kitware.com/cmake/cmake/-/issues/20826 +RUN python3 -m pip install cmake==3.17.1 # ccache on Ubuntu 18.04 is too old to support Cuda correctly COPY install/deb_ubuntu_ccache.sh /work/ @@ -73,6 +74,18 @@ RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-r apt-get install -y cuda-cross-aarch64 cuda-cross-aarch64-10-0 && \ rm -rf /var/lib/apt/lists/* +RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb && \ + mkdir cuda_repo_l4t && \ + dpkg -x cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb cuda_repo_l4t && \ + rm cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb && \ + mkdir cuda_l4t && \ 
+ cd cuda_l4t && \ + for f in ../cuda_repo_l4t/var/cuda-repo-10-0-local-10.0.326/*.deb; do dpkg -x $f .; done && \ + cp -r usr/local/cuda-10.0/targets/aarch64-linux/ /usr/local/cuda-10.0/targets && \ + cp /usr/local/cuda/targets/aarch64-linux/lib/stubs/libcuda.so /usr/aarch64-linux-gnu/lib && \ + cd - && \ + rm -rf cuda_repo_l4t cuda_l4t + ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ From 611276c20871541a9412914673a52a38a9837089 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 7 Jul 2020 14:44:38 -0700 Subject: [PATCH 43/64] Fix the aws s3 command --- ci/docker/Dockerfile.build.jetson | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index 92c1df75ed07..9ab6e0de7160 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -74,7 +74,7 @@ RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-r apt-get install -y cuda-cross-aarch64 cuda-cross-aarch64-10-0 && \ rm -rf /var/lib/apt/lists/* -RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb && \ +RUN aws s3 cp s3://mxnet-ci-prod-private-slave-data/nvidia/sdkm_downloads/cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb . && \ mkdir cuda_repo_l4t && \ dpkg -x cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb cuda_repo_l4t && \ rm cuda-repo-l4t-10-0-local-10.0.326_1.0-1_arm64.deb && \ From 724571bd481f13efa35e954690866181f7a0785a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 7 Jul 2020 15:13:03 -0700 Subject: [PATCH 44/64] Debugging Windows failure --- tests/python/unittest/test_gluon.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 47ef86ff58ed..31ab30aa8bdb 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -714,6 +714,18 @@ def _syncParameters(bn1, bn2, ctx): input1.attach_grad() inputs2 = split_and_load(input2, ctx_list, batch_axis=0) + + if cuda: + print("input1:", input1) + print("input2:", input2) + print("gamma1:", bn1.gamma.data()) + print("gamma2:", bn2.gamma.data()) + print("beta1:", bn1.beta.data()) + print("beta2:", bn2.beta.data()) + print("running_mean1:", bn1.running_mean.data()) + print("running_mean2:", bn2.running_mean.data()) + print("running_var1:", bn1.running_var.data()) + print("running_var2:", bn2.running_var.data()) for xi in inputs2: xi.attach_grad() @@ -754,6 +766,12 @@ def _syncParameters(bn1, bn2, ctx): atol = 1e-2 rtol = 1e-2 + if cuda: + print("output1:", output1) + print("target_output:", target_output) + print("output2:", output2) + print("data_mean:", data_mean) + print("data_var:", data_var) assert_almost_equal(output1.asnumpy(), target_output.asnumpy(), atol=atol, rtol=rtol) assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), From c402a22ed8a67c93b35e81a82cb1de07044b83ad Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 9 Jul 2020 08:54:43 -0700 Subject: [PATCH 45/64] More debugging of Windows failure --- src/common/cuda/rtc.cc | 2 +- tests/python/unittest/test_gluon.py | 40 ++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index ee1c7b594067..19f5aa6a3c3f 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -130,7 +130,7 @@ CUfunction get_function(const std::string ¶meters, 
std::string code_with_header = common_header + parameters + code; // If verbose mode, output kernel source, though not including the common header if (dmlc::GetEnv("MXNET_RTC_VERBOSE", false)) { - LOG(INFO) << "\n" << std::string(80, '-') << "\n" << code; + LOG(INFO) << "\n" << std::string(80, '-') << "\n" << (parameters + code); } if (compiled_kernels_this_arch.size() == CACHESIZE_WARN_THRESHOLD + 1 && dmlc::GetEnv("MXNET_RTC_SIZE_WARNING", true)) { diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 31ab30aa8bdb..7414e72622d1 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -692,6 +692,32 @@ def _syncParameters(bn1, bn2, ctx): bn2.running_mean.set_data(bn1.running_mean.data(ctx)) bn2.running_var.set_data(bn1.running_var.data(ctx)) + if cuda: + input = mx.nd.array( + [[0.09598747, 0.11375386], + [0.25655177, 0.25434464], + [0.94226855, 0.18708304], + [0.4764183, 0.8131366 ], + [0.21693859, 0.6864563 ], + [0.5725503, 0.8149943 ], + [0.13844976, 0.75451255], + [0.19600964, 0.1027916 ], + [0.5756048, 0.41257784], + [0.56359816, 0.46988449], + [0.01285437, 0.6327416 ], + [0.4622506, 0.89027345], + [0.1828535, 0.00556029], + [0.11348299, 0.04499052], + [0.7939917, 0.32591197], + [0.3193446, 0.75737333], + [0.7434559, 0.6272748 ], + [0.09003141, 0.6050287 ], + [0.65154195, 0.9893612 ], + [0.20389462, 0.35144925], + [0.07909038, 0.6846926 ], + [0.10114695, 0.62155235], + [0.769926, 0.92900974], + [0.2362933, 0.25065088]]) input1 = input.copy() input2 = input.copy() @@ -750,10 +776,14 @@ def _syncParameters(bn1, bn2, ctx): data_mean = data.mean( axis=axis, exclude=True, keepdims=True) - data_var = (data - data_mean).square().mean(axis=axis, - exclude=True, keepdims=True) + diff = data - data_mean + diffsq = diff.square() + data_var = diffsq.mean(axis=axis, + exclude=True, keepdims=True) + data_varpluseps = data_var + epsilon + sqrtdatavarpluseps = data_varpluseps.sqrt() - target_output = (data - data_mean) / (data_var + epsilon).sqrt() + target_output = diff / sqrtdatavarpluseps # squeeze data_mean and data_var data_mean_flat = data_mean.squeeze() @@ -771,7 +801,11 @@ def _syncParameters(bn1, bn2, ctx): print("target_output:", target_output) print("output2:", output2) print("data_mean:", data_mean) + print("diff:", diff) + print("diffsq:", diffsq) print("data_var:", data_var) + print("data_varpluseps:", data_varpluseps) + print("sqrtdatavarpluseps:", sqrtdatavarpluseps) assert_almost_equal(output1.asnumpy(), target_output.asnumpy(), atol=atol, rtol=rtol) assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), From 98acaefe0cc94cde4f518190884acd0da39bed69 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 10 Jul 2020 10:19:00 -0700 Subject: [PATCH 46/64] Debug --- .../tensor/elemwise_binary_broadcast_op.cc | 15 +++++++++++++++ tests/python/unittest/test_gluon.py | 2 ++ 2 files changed, 17 insertions(+) diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index b89ad92a051a..8ef010430bf9 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -244,6 +244,8 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + // Failing lint to not trigger the full CI + std::cout << "BinaryBroadcastRTCCompute " << OP << std::endl; using namespace 
mxnet::common::cuda::rtc; if (outputs[0].shape_.Size() == 0U) return; if (req[0] == kNullOp) return; @@ -252,12 +254,17 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); + std::cout << ndim << std::endl; + std::cout << new_lshape << std::endl; + std::cout << new_rshape << std::endl; + std::cout << new_oshape << std::endl; // Pad the ndim BROADCAST_NDIM_SWITCH(ndim, NDim, { if (ndim != 0) { ndim = NDim; } }); + std::cout << ndim << std::endl; if (!ndim) { ElemwiseBinaryRTCCompute {OP}(attrs, ctx, inputs, req, outputs); @@ -274,6 +281,7 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const int nvec = output_type_size <= sizeof(uint64_t) ? (sizeof(uint64_t) / output_type_size) : 1; + std::cout << nvec << std::endl; binary_broadcast_params params{}; params.inputs[0] = lhs.dptr_; params.inputs[1] = rhs.dptr_; @@ -282,10 +290,13 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, params.stride[0][i] = lstride[i]; params.stride[1][i] = rstride[i]; params.oshape[i] = new_oshape[i]; + std::cout << i << " " << params.stride[0][i] << " " << params.stride[1][i] << " " << params.oshape[i] << std::endl; } params.size[0] = lhs.shape_.Size(); params.size[1] = rhs.shape_.Size(); + std::cout << params.size[0] << " " << params.size[1] << std::endl; + index_t lead_dim = 1; for (int i = ndim - 1; i >= 0; --i) { /* Find the first non-1 dimension @@ -296,7 +307,9 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, break; } } + std::cout << lead_dim << std::endl; const index_t other_dim = output.shape_.Size() / lead_dim; + std::cout << other_dim << std::endl; int first_different = -1; int common_shape = 1; @@ -308,6 +321,8 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, break; } } + std::cout << first_different << std::endl; + std::cout << common_shape << std::endl; int lead_input_num = 0; std::string code = std::string("const OpReqType req = ") + diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 7414e72622d1..18376618bfc3 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -844,6 +844,8 @@ def _syncParameters(bn1, bn2, ctx): _check_batchnorm_result(mx.nd.random.uniform(shape=shape, ctx=mx.cpu(0)), num_devices=ndev, cuda=cuda) + if cuda: + return @with_seed() From b4a6794c283be72a16f101ab623e67c1b1b20a7b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 16 Jul 2020 16:40:36 -0700 Subject: [PATCH 47/64] Fix the issue on Windows (long -> long long for 8B) --- src/common/cuda/rtc/vectorization-inl.h | 6 +- .../tensor/elemwise_binary_broadcast_op.cc | 15 ----- tests/python/unittest/test_gluon.py | 60 +------------------ 3 files changed, 6 insertions(+), 75 deletions(-) diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index 40fa84422299..bb328e8929d1 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -64,17 +64,17 @@ struct VectorType<4> { template <> struct VectorType<8> { - using type = ulong1; + using type = ulonglong1; }; template <> struct VectorType<16> { - using type = ulong2; + using type = ulonglong2; }; template <> struct VectorType<32> { - using type = ulong4; + using type = ulonglong4; }; /* \brief Helper class 
that enables storing multiple values of type DType diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index 8ef010430bf9..b89ad92a051a 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -244,8 +244,6 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - // Failing lint to not trigger the full CI - std::cout << "BinaryBroadcastRTCCompute " << OP << std::endl; using namespace mxnet::common::cuda::rtc; if (outputs[0].shape_.Size() == 0U) return; if (req[0] == kNullOp) return; @@ -254,17 +252,12 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, mxnet::TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); - std::cout << ndim << std::endl; - std::cout << new_lshape << std::endl; - std::cout << new_rshape << std::endl; - std::cout << new_oshape << std::endl; // Pad the ndim BROADCAST_NDIM_SWITCH(ndim, NDim, { if (ndim != 0) { ndim = NDim; } }); - std::cout << ndim << std::endl; if (!ndim) { ElemwiseBinaryRTCCompute {OP}(attrs, ctx, inputs, req, outputs); @@ -281,7 +274,6 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, const int nvec = output_type_size <= sizeof(uint64_t) ? (sizeof(uint64_t) / output_type_size) : 1; - std::cout << nvec << std::endl; binary_broadcast_params params{}; params.inputs[0] = lhs.dptr_; params.inputs[1] = rhs.dptr_; @@ -290,13 +282,10 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, params.stride[0][i] = lstride[i]; params.stride[1][i] = rstride[i]; params.oshape[i] = new_oshape[i]; - std::cout << i << " " << params.stride[0][i] << " " << params.stride[1][i] << " " << params.oshape[i] << std::endl; } params.size[0] = lhs.shape_.Size(); params.size[1] = rhs.shape_.Size(); - std::cout << params.size[0] << " " << params.size[1] << std::endl; - index_t lead_dim = 1; for (int i = ndim - 1; i >= 0; --i) { /* Find the first non-1 dimension @@ -307,9 +296,7 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, break; } } - std::cout << lead_dim << std::endl; const index_t other_dim = output.shape_.Size() / lead_dim; - std::cout << other_dim << std::endl; int first_different = -1; int common_shape = 1; @@ -321,8 +308,6 @@ void BinaryBroadcastRTCCompute::operator()(const nnvm::NodeAttrs& attrs, break; } } - std::cout << first_different << std::endl; - std::cout << common_shape << std::endl; int lead_input_num = 0; std::string code = std::string("const OpReqType req = ") + diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 18376618bfc3..47ef86ff58ed 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -692,32 +692,6 @@ def _syncParameters(bn1, bn2, ctx): bn2.running_mean.set_data(bn1.running_mean.data(ctx)) bn2.running_var.set_data(bn1.running_var.data(ctx)) - if cuda: - input = mx.nd.array( - [[0.09598747, 0.11375386], - [0.25655177, 0.25434464], - [0.94226855, 0.18708304], - [0.4764183, 0.8131366 ], - [0.21693859, 0.6864563 ], - [0.5725503, 0.8149943 ], - [0.13844976, 0.75451255], - [0.19600964, 0.1027916 ], - [0.5756048, 0.41257784], - [0.56359816, 0.46988449], - [0.01285437, 0.6327416 ], - [0.4622506, 0.89027345], - [0.1828535, 
0.00556029], - [0.11348299, 0.04499052], - [0.7939917, 0.32591197], - [0.3193446, 0.75737333], - [0.7434559, 0.6272748 ], - [0.09003141, 0.6050287 ], - [0.65154195, 0.9893612 ], - [0.20389462, 0.35144925], - [0.07909038, 0.6846926 ], - [0.10114695, 0.62155235], - [0.769926, 0.92900974], - [0.2362933, 0.25065088]]) input1 = input.copy() input2 = input.copy() @@ -740,18 +714,6 @@ def _syncParameters(bn1, bn2, ctx): input1.attach_grad() inputs2 = split_and_load(input2, ctx_list, batch_axis=0) - - if cuda: - print("input1:", input1) - print("input2:", input2) - print("gamma1:", bn1.gamma.data()) - print("gamma2:", bn2.gamma.data()) - print("beta1:", bn1.beta.data()) - print("beta2:", bn2.beta.data()) - print("running_mean1:", bn1.running_mean.data()) - print("running_mean2:", bn2.running_mean.data()) - print("running_var1:", bn1.running_var.data()) - print("running_var2:", bn2.running_var.data()) for xi in inputs2: xi.attach_grad() @@ -776,14 +738,10 @@ def _syncParameters(bn1, bn2, ctx): data_mean = data.mean( axis=axis, exclude=True, keepdims=True) - diff = data - data_mean - diffsq = diff.square() - data_var = diffsq.mean(axis=axis, - exclude=True, keepdims=True) - data_varpluseps = data_var + epsilon - sqrtdatavarpluseps = data_varpluseps.sqrt() + data_var = (data - data_mean).square().mean(axis=axis, + exclude=True, keepdims=True) - target_output = diff / sqrtdatavarpluseps + target_output = (data - data_mean) / (data_var + epsilon).sqrt() # squeeze data_mean and data_var data_mean_flat = data_mean.squeeze() @@ -796,16 +754,6 @@ def _syncParameters(bn1, bn2, ctx): atol = 1e-2 rtol = 1e-2 - if cuda: - print("output1:", output1) - print("target_output:", target_output) - print("output2:", output2) - print("data_mean:", data_mean) - print("diff:", diff) - print("diffsq:", diffsq) - print("data_var:", data_var) - print("data_varpluseps:", data_varpluseps) - print("sqrtdatavarpluseps:", sqrtdatavarpluseps) assert_almost_equal(output1.asnumpy(), target_output.asnumpy(), atol=atol, rtol=rtol) assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), @@ -844,8 +792,6 @@ def _syncParameters(bn1, bn2, ctx): _check_batchnorm_result(mx.nd.random.uniform(shape=shape, ctx=mx.cpu(0)), num_devices=ndev, cuda=cuda) - if cuda: - return @with_seed() From 5eef300ee6b07855d51b010548267776dc9bcb02 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 17 Jul 2020 10:25:37 -0700 Subject: [PATCH 48/64] libcuda.so for Jetson --- ci/docker/Dockerfile.build.jetson | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index 93c5558d2228..43a8e154f3d5 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -90,6 +90,9 @@ RUN wget https://repo.download.nvidia.com/jetson/x86_64/pool/r32.4/c/cuda/cuda-c # while previously it used to store it at /usr/local/cuda/targets/aarch64-linux/lib/stubs RUN ln -s /usr/lib/aarch64-linux-gnu/libcublas.so /usr/local/cuda/targets/aarch64-linux/lib/stubs/libcublas.so +# Need the stub for libcuda.so in a place where host linker can find it to link the unit tests +RUN ln -s /usr/local/cuda/targets/aarch64-linux/lib/stubs/libcuda.so /usr/lib/aarch64-linux-gnu/libcuda.so + ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ From b05075bb7af9fcc7d880e9f7c03e69bd4b442721 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 17 Jul 2020 13:37:34 -0700 Subject: [PATCH 49/64] Enable debug information for RTC kernels and cleaning debug ptx dump --- 
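For reference, the pattern this change relies on pairs NVRTC's "-G" compile switch with the matching CUDA driver JIT options (CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_GENERATE_LINE_INFO): "-G" only embeds debug information in the generated PTX, while the JIT options tell the driver to carry it through when the PTX is compiled for the target GPU. A minimal sketch of that pattern follows; it is illustrative only, the helper name compile_debuggable_kernel is invented for the example, the kernel is assumed to be declared extern "C" so its plain name resolves, and error checking, CUDA context setup, and lowered-name lookup are omitted.

#include <cuda.h>
#include <nvrtc.h>
#include <string>

CUfunction compile_debuggable_kernel(const std::string& code,
                                     const std::string& kernel_name,
                                     int sm_arch) {
  // Compile the source with NVRTC; "-G" is only added in debug builds.
  nvrtcProgram program;
  nvrtcCreateProgram(&program, code.c_str(), (kernel_name + ".cu").c_str(),
                     0, nullptr, nullptr);
  const std::string arch = "--gpu-architecture=compute_" + std::to_string(sm_arch);
  const char* opts[] = {arch.c_str(),
#if NDEBUG == 0
                        "-G",           // embed debug info, disable optimizations
#endif
                        "--std=c++14"};
  // sizeof(opts)/sizeof(opts[0]) keeps the option count consistent with the #if above.
  nvrtcCompileProgram(program, static_cast<int>(sizeof(opts) / sizeof(opts[0])), opts);

  // Retrieve the PTX produced by NVRTC.
  size_t ptx_size;
  nvrtcGetPTXSize(program, &ptx_size);
  std::string ptx(ptx_size, '\0');
  nvrtcGetPTX(program, &ptx[0]);
  nvrtcDestroyProgram(&program);

  // Ask the driver JIT to preserve debug and line info when lowering PTX to SASS.
#if NDEBUG == 0
  intptr_t debug_info = 1, line_info = 1;
#else
  intptr_t debug_info = 0, line_info = 0;
#endif
  CUjit_option jit_opts[] = {CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_GENERATE_LINE_INFO};
  void* jit_opt_values[] = {reinterpret_cast<void*>(debug_info),
                            reinterpret_cast<void*>(line_info)};
  CUmodule module;
  cuModuleLoadDataEx(&module, ptx.c_str(), 2, jit_opts, jit_opt_values);

  // Look up the kernel entry point (extern "C" assumed, so no name mangling).
  CUfunction kernel;
  cuModuleGetFunction(&kernel, module, kernel_name.c_str());
  return kernel;
}
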
src/common/cuda/rtc.cc | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 19f5aa6a3c3f..01020cdb7ecc 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -139,10 +139,6 @@ CUfunction get_function(const std::string ¶meters, << ". Set MXNET_RTC_SIZE_WARNING=0 to quiet this warning."; } nvrtcProgram program; - std::ofstream f("debug.log"); - f << code_with_header; - f.close(); - NVRTC_CALL(nvrtcCreateProgram(&program, // prog &code_with_header[0], // buffer (kernel_name + "_kernel.cu").c_str(), // name @@ -152,15 +148,25 @@ CUfunction get_function(const std::string ¶meters, std::string gpu_arch_arg = "--gpu-architecture=compute_" + std::to_string(sm_arch); const char *opts[] = {gpu_arch_arg.c_str(), +#if NDEBUG == 0 + "-G", +#endif "--std=c++11"}; const std::string kernel_name_demangled = kernel_name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); - nvrtcResult compileResult = nvrtcCompileProgram(program, // prog - 2, // num options - opts); // options + nvrtcResult compileResult = nvrtcCompileProgram(program, // prog + sizeof(opts) / sizeof(opts[0]), // num options + opts); // options + static const std::string dump_file = "mxnet_rtc_debug_code.log"; + if (compileResult != NVRTC_SUCCESS) { + std::ofstream f(dump_file); + f << code_with_header; + f.close(); + } CHECK_EQ(compileResult, NVRTC_SUCCESS) - << "NVRTC Compilation failed. Please set environment variable MXNET_USE_FUSION to 0.\n" + << "NVRTC Compilation failed.\n" + << "The generated code was stored in " << dump_file << "\n" << GetCompileLog(program); kinfo.ptx = GetPtx(program); @@ -184,11 +190,20 @@ CUfunction get_function(const std::string ¶meters, CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); // Jit-compile ptx for the driver's current context CUmodule module; - std::ofstream f("debug.ptx"); - f << kinfo.ptx; - f.close(); - CUDA_DRIVER_CALL(cuModuleLoadData(&module, kinfo.ptx.c_str())); +#if NDEBUG == 0 + intptr_t debug_info = 1; + intptr_t line_info = 1; +#else + intptr_t debug_info = 0; + intptr_t line_info = 0; +#endif + + std::cout << debug_info << " " << line_info << std::endl; + CUjit_option jit_opts[] = {CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_GENERATE_LINE_INFO}; + void* jit_opt_values[] = {(void*)debug_info, (void*)line_info}; + + CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, kinfo.ptx.c_str(), 2, jit_opts, jit_opt_values)); CUDA_DRIVER_CALL(cuModuleGetFunction(&kinfo.functions[dev_id], module, kinfo.mangled_name.c_str())); From 55337df80017e3292c9800a075d060fa5367edbc Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 20 Jul 2020 08:49:57 -0700 Subject: [PATCH 50/64] Fix lint --- src/common/cuda/rtc.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 01020cdb7ecc..4b5208faddbf 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -201,7 +201,8 @@ CUfunction get_function(const std::string ¶meters, std::cout << debug_info << " " << line_info << std::endl; CUjit_option jit_opts[] = {CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_GENERATE_LINE_INFO}; - void* jit_opt_values[] = {(void*)debug_info, (void*)line_info}; + void* jit_opt_values[] = {reinterpret_cast(debug_info), + reinterpret_cast(line_info)}; CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, kinfo.ptx.c_str(), 2, jit_opts, jit_opt_values)); CUDA_DRIVER_CALL(cuModuleGetFunction(&kinfo.functions[dev_id], From 
50388d8a42a2fcf648dd017d94fec297a3a9cd0c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 20 Jul 2020 11:59:19 -0700 Subject: [PATCH 51/64] Try without linking the stub of libcuda.so to different place in Jetson --- CMakeLists.txt | 12 ++++-------- ci/docker/Dockerfile.build.jetson | 3 --- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e227009eba5..746dfbd79f9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -587,15 +587,11 @@ if(USE_CUDA) string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}") - find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand - OPTIONAL_COMPONENTS nvToolsExt nvrtc) + find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc cuda_driver + OPTIONAL_COMPONENTS nvToolsExt) - list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand) - if(CUDA_nvrtc_LIBRARY) - list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda) - else() - message(FATAL_ERROR "Failed to find NVRTC. CMake will exit." ) - endif() + list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand + CUDA::nvrtc CUDA::cuda_driver) list(APPEND SOURCE ${CUDA}) add_definitions(-DMXNET_USE_CUDA=1) diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson index 43a8e154f3d5..93c5558d2228 100644 --- a/ci/docker/Dockerfile.build.jetson +++ b/ci/docker/Dockerfile.build.jetson @@ -90,9 +90,6 @@ RUN wget https://repo.download.nvidia.com/jetson/x86_64/pool/r32.4/c/cuda/cuda-c # while previously it used to store it at /usr/local/cuda/targets/aarch64-linux/lib/stubs RUN ln -s /usr/lib/aarch64-linux-gnu/libcublas.so /usr/local/cuda/targets/aarch64-linux/lib/stubs/libcublas.so -# Need the stub for libcuda.so in a place where host linker can find it to link the unit tests -RUN ln -s /usr/local/cuda/targets/aarch64-linux/lib/stubs/libcuda.so /usr/lib/aarch64-linux-gnu/libcuda.so - ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ From 3ce89840b09c7e550df25f14220170bd341facae Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 20 Jul 2020 11:33:11 -0700 Subject: [PATCH 52/64] Add docstring --- src/common/cuda/rtc.h | 18 ++++++++++++++++++ src/common/cuda/rtc/vectorization-inl.h | 15 +++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/common/cuda/rtc.h b/src/common/cuda/rtc.h index ea95346a0468..126c967a0cb3 100644 --- a/src/common/cuda/rtc.h +++ b/src/common/cuda/rtc.h @@ -46,17 +46,35 @@ namespace rtc { namespace util { +/*! \brief Convert OpReqType to string. + * \param req to convert + */ std::string to_string(OpReqType req); } // namespace util extern std::mutex lock; +/*! \brief Compile and get the GPU kernel. Uses cache in order to + * eliminate the overhead of compilation. + * \param parameters of the kernel (e.g. values of the template arguments, types used) + * \param kernel_name name of the kernel + * \param code used for compilation of the kernel if not found in cache + * \param dev_id id of the device which the kernel will be launched on + */ CUfunction get_function(const std::string ¶meters, const std::string &kernel_name, const std::string &code, int dev_id); +/*! \brief Launch a GPU kernel. 
+ * \param function to launch + * \param grid_dim grid dimensions + * \param block_dim block dimensions + * \param shared_mem_bytes amount of dynamic shared memory needed by the kernel + * \param stream used for launching the kernel + * \param args arguments of the kernel + */ void launch(CUfunction function, const dim3 grid_dim, const dim3 block_dim, diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index bb328e8929d1..2c05a616347b 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -314,6 +314,21 @@ constexpr int vectorized_kernel_thread_num = 512; } // namespace +/*! \brief Launcher helper for the kernels using vectorization. + * \param parameters of the kernel (e.g. values of the template arguments) + * \param kernel_name name of the kernel + * \param code used for compilation of the kernel if not found in cache + * \param nvec length of the vector used for loading/storing data + * \param lead_dim size of leading dimension of the tensors + * \param other_dim maximum of the total size of all the other dimensions of the tensors + * \param s stream used to launch the kernel + * \param inputs to the kernel + * \param outputs of the kernel + * \param dev_id id of the devide which the kernel will be launched on + * \param lead_input_num number of input to use for checking alignment + * (in case only a subset of inputs is used vectorized). + * Default is 0. + */ template void VectorizedKernelRTCLauncher(const std::string ¶meters, const std::string &kernel_name, From 0bd007ab990053e7110dde2554c4ac592f736508 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 21 Jul 2020 13:36:29 -0700 Subject: [PATCH 53/64] Answering review comments --- src/common/cuda/rtc.cc | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 4b5208faddbf..f253d87e6520 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -71,11 +71,9 @@ namespace { std::string GetCompileLog(nvrtcProgram program) { size_t log_size_including_null; NVRTC_CALL(nvrtcGetProgramLogSize(program, &log_size_including_null)); - // For most std::string implementations, this is probably 1 char bigger than needed. OK though. - std::string log(log_size_including_null, '\0'); + std::string log(log_size_including_null - 1, '\0'); + // Room for terminating null character ensured since C++11 NVRTC_CALL(nvrtcGetProgramLog(program, &log[0])); - // Make sure the string reflects the true size (so minus the null terminator). - log.resize(log_size_including_null - 1); return log; } @@ -83,11 +81,9 @@ std::string GetCompileLog(nvrtcProgram program) { std::string GetPtx(nvrtcProgram program) { size_t ptx_size_including_null; NVRTC_CALL(nvrtcGetPTXSize(program, &ptx_size_including_null)); - // For most std::string implementations, this is probably 1 char bigger than needed. OK though. - std::string ptx(ptx_size_including_null, '\0'); + std::string ptx(ptx_size_including_null - 1, '\0'); + // Room for terminating null character ensured since C++11 NVRTC_CALL(nvrtcGetPTX(program, &ptx[0])); - // Make sure the string reflects the true size (so minus the null terminator). 
- ptx.resize(ptx_size_including_null - 1); return ptx; } @@ -151,7 +147,7 @@ CUfunction get_function(const std::string ¶meters, #if NDEBUG == 0 "-G", #endif - "--std=c++11"}; + "--std=c++14"}; const std::string kernel_name_demangled = kernel_name; NVRTC_CALL(nvrtcAddNameExpression(program, (kernel_name_demangled).c_str())); @@ -199,7 +195,6 @@ CUfunction get_function(const std::string ¶meters, intptr_t line_info = 0; #endif - std::cout << debug_info << " " << line_info << std::endl; CUjit_option jit_opts[] = {CU_JIT_GENERATE_DEBUG_INFO, CU_JIT_GENERATE_LINE_INFO}; void* jit_opt_values[] = {reinterpret_cast(debug_info), reinterpret_cast(line_info)}; From ee82cd6f3d2d90757f3efeabe69cde13a846adc8 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 21 Jul 2020 14:00:53 -0700 Subject: [PATCH 54/64] Unifying vectorization --- src/common/cuda/rtc.cc | 2 +- src/common/cuda/rtc/forward_functions-inl.h | 117 ++++++-------------- src/common/cuda/rtc/vectorization-inl.h | 27 +++++ 3 files changed, 59 insertions(+), 87 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index f253d87e6520..5d2596cc9be9 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -117,11 +117,11 @@ CUfunction get_function(const std::string ¶meters, type_support_string + "\n" + util_string + "\n" + special_functions_definitions + '\n' + + vectorization_support_string + "\n" + function_definitions_util + "\n" + function_definitions_binary + "\n" + function_definitions_unary + "\n" + backward_function_definitions + "\n" + - vectorization_support_string + "\n" + reducer + "\n"; std::string code_with_header = common_header + parameters + code; // If verbose mode, output kernel source, though not including the common header diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 7dfafc86de9b..6555ad9f24a8 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -63,57 +63,6 @@ __device__ inline half store(const DType input, half* ref) { return __float2half(input); } -template -struct VectorConfig { - static_assert(size >= 4, "VectorConfig needs to have size of at least 4B"); - using IndexType = float; -}; - -template <> -struct VectorConfig<8> { - using IndexType = double; -}; - -template <> -struct VectorConfig<16> { - using IndexType = double2; -}; - -template <> -struct VectorConfig<32> { - using IndexType = double4; -}; - -template -__device__ inline DType add_elem(const DType& x, const DType& y) { - return x + y; -} - -template <> -__device__ inline half add_elem(const half& x, const half& y) { - return __float2half(__half2float(x) + __half2float(y)); -} - -template -union VectorType { - typename VectorConfig::IndexType y; - DType x[nvec]; - __device__ VectorType () {}; - __device__ VectorType (const VectorType& y2) { - y = y2.y; - } - __device__ VectorType (const decltype(y) &y2) { - y = y2; - } - __device__ inline VectorType& operator+=(const VectorType& rhs) { - #pragma unroll - for (int i = 0; i < nvec; ++i) { - x[i] = add_elem(x[i], rhs.x[i]); - } - return *this; - } -}; - template struct Shape { int x[ndim]; @@ -138,39 +87,35 @@ struct Shape<0> { }; template -__device__ inline VectorType load_index(const DType * input, int i, - const Shape &shape) { +__device__ inline vector::VectorizedStorage load_index(const DType * input, int i, + const Shape &shape) { + using V = vector::VectorizedStorage; if (i < shape.size) { - const auto* vector_input = reinterpret_cast< - const typename 
VectorConfig::IndexType *>( - input + i); - VectorType ret = {*vector_input}; - return ret; + const auto* vector_input = reinterpret_cast(input + i); + return V(*vector_input); } else { - VectorType ret({0}); - return ret; + return V(0); } } template -__device__ inline VectorType global_load_index(const DType * input, int i, - const Shape &shape) { +__device__ inline vector::VectorizedStorage global_load_index(const DType * input, + int i, const Shape &shape) { + using V = vector::VectorizedStorage; if (i < shape.size) { - const auto* vector_input = reinterpret_cast< - const typename VectorConfig::IndexType *>( - input + i); - VectorType ret = {__ldg(vector_input)}; - return ret; + const auto* vector_input = reinterpret_cast(input + i); + return V(__ldg(vector_input)); } else { - VectorType ret({0}); - return ret; + return V(0); } } template -__device__ inline VectorType load_slice(const DType * input, const Shape& shape, - Shape begin, Shape end, - int offset) { +__device__ inline vector::VectorizedStorage load_slice(const DType * input, + const Shape& shape, + Shape begin, + Shape end, + int offset) { int idx[nvec]; Shape ref_strides; @@ -200,20 +145,20 @@ __device__ inline VectorType load_slice(const DType * input, const ref_idx = ref_idx % stride; } } - VectorType ret; + vector::VectorizedStorage ret; #pragma unroll for (int j = 0; j < nvec; j++) { - ret.x[j] = *(input + idx[j]); + ret.scratch_.separate[j] = *(input + idx[j]); } return ret; } template -__device__ inline VectorType fast_load_slice(const DType * input, - const Shape& shape, - Shape begin, - Shape end, - int offset) { +__device__ inline vector::VectorizedStorage fast_load_slice(const DType * input, + const Shape& shape, + Shape begin, + Shape end, + int offset) { int idx = 0; Shape ref_strides; @@ -243,24 +188,24 @@ __device__ inline VectorType fast_load_slice(const DType * input, } template -__device__ inline void store_index(const VectorType value, int i, +__device__ inline void store_index(const vector::VectorizedStorage value, int i, DType * output, const Shape& shape) { if (i < (shape.size + nvec - 1) / nvec) { auto vector_output = reinterpret_cast< - typename VectorConfig::IndexType *>(output); - vector_output[i] = value.y; + typename vector::VectorizedStorage::LType *>(output); + vector_output[i] = value.scratch_.aligned; } } template -__device__ inline void store_add_index(const VectorType value, int i, +__device__ inline void store_add_index(const vector::VectorizedStorage value, int i, DType * output, const Shape& shape) { if (i < (shape.size + nvec - 1) / nvec) { auto vector_output = reinterpret_cast< - typename VectorConfig::IndexType *>(output); - VectorType ret(vector_output[i]); + typename vector::VectorizedStorage::LType *>(output); + vector::VectorizedStorage ret(vector_output[i]); ret += value; - vector_output[i] = ret.y; + vector_output[i] = ret.scratch_.aligned; } } diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index 2c05a616347b..99bcb0143303 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -77,6 +77,16 @@ struct VectorType<32> { using type = ulonglong4; }; +template +__device__ inline DType add_elem(const DType& x, const DType& y) { + return x + y; +} + +template <> +__device__ inline half add_elem(const half& x, const half& y) { + return __float2half(__half2float(x) + __half2float(y)); +} + /* \brief Helper class that enables storing multiple values of type DType as 1 value of type LType. 
*/ @@ -92,6 +102,23 @@ class VectorizedStorage { inline __device__ vectorized_storage() {} inline __device__ ~vectorized_storage() {} } scratch_; + + inline __device__ VectorizedStorage() {} + inline __device__ VectorizedStorage (const VectorizedStorage& y2) { + scratch_.aligned = y2.scratch_.aligned; + } + inline __device__ VectorizedStorage (const LType &y2) { + scratch_.aligned = y2; + } + inline __device__ VectorizedStorage& operator+=( + const VectorizedStorage& rhs) { + #pragma unroll + for (int i = 0; i < nvec; ++i) { + scratch_.separate[i] = add_elem(scratch_.separate[i], rhs.scratch_.separate[i]); + } + return *this; + } + inline __device__ ~VectorizedStorage() {} }; // Returns const LType is DType is const From 242aba86cfd20df58ad19962ce5326d1000cd3d4 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 23 Jul 2020 15:46:06 -0700 Subject: [PATCH 55/64] Fix --- src/common/cuda/rtc/forward_functions-inl.h | 4 ++-- src/common/cuda/rtc/vectorization-inl.h | 8 ++++---- src/operator/fusion/fused_op.cu | 10 ++++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 6555ad9f24a8..d3995631a167 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -94,7 +94,7 @@ __device__ inline vector::VectorizedStorage load_index(const DType const auto* vector_input = reinterpret_cast(input + i); return V(*vector_input); } else { - return V(0); + return V({0}); } } @@ -106,7 +106,7 @@ __device__ inline vector::VectorizedStorage global_load_index(const const auto* vector_input = reinterpret_cast(input + i); return V(__ldg(vector_input)); } else { - return V(0); + return V({0}); } } diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index 99bcb0143303..173e580fddd2 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -48,23 +48,23 @@ struct VectorType { template <> struct VectorType<1> { - using type = char1; + using type = char; }; template <> struct VectorType<2> { - using type = short1; + using type = short; }; template <> struct VectorType<4> { - using type = uint1; + using type = int; }; template <> struct VectorType<8> { - using type = ulonglong1; + using type = long long; }; template <> diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu index 9b6b7d8321d9..3c27e87af2a7 100644 --- a/src/operator/fusion/fused_op.cu +++ b/src/operator/fusion/fused_op.cu @@ -332,7 +332,7 @@ std::string FusedOp::GenerateCode(const std::vector &req, size_t counter = 0; for (const auto& entry : g.outputs()) { std::string var_name = "output" + std::to_string(counter); - code += "op::VectorType vec_" + var_name + ";\n"; ++counter; } @@ -348,7 +348,7 @@ std::string FusedOp::GenerateCode(const std::vector &req, if (source->is_variable()) { if (load_index[i]) { code += "const auto " + var_name + " = op::load(vec_" + - variables[{i, 0}] + ".x[j]);\n"; + variables[{i, 0}] + ".scratch_.separate[j]);\n"; CHECK_EQ(outputs[i], 1); variables[{i, 0}] = var_name; } @@ -370,7 +370,9 @@ std::string FusedOp::GenerateCode(const std::vector &req, } if (fusion::slice_ops.find(op_name) != fusion::slice_ops.end()) { - code += "const auto " + var_name + " = op::load(" + variables[{i, 0}] + ".x[j]);\n"; + code += "const auto " + var_name + + " = op::load(" + variables[{i, 0}] + + ".scratch_.separate[j]);\n"; variables[{i, 0}] = var_name; continue; } @@ 
-479,7 +481,7 @@ std::string FusedOp::GenerateCode(const std::vector &req, for (const auto& entry : g.outputs()) { const std::string& var = variables[{entry.node_id, entry.index}]; const auto var_name = "output" + std::to_string(counter); - code += "vec_" + var_name + ".x[j] = op::store("+ var +", " + var_name + ");\n"; + code += "vec_" + var_name + ".scratch_.separate[j] = op::store("+ var +", " + var_name + ");\n"; ++counter; } From 57d72a74ac26842093721d9088f9049e15ce66d6 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 28 Jul 2020 12:55:55 -0700 Subject: [PATCH 56/64] Fixes for reduce ops --- src/common/cuda/rtc/half-inl.h | 1 - src/common/cuda/rtc/util-inl.h | 53 +++++++++++++++++++++++++++---- src/common/utils.cc | 8 ++--- src/common/utils.h | 6 +++- src/operator/tensor/reduce_rtc.cc | 14 ++++---- 5 files changed, 62 insertions(+), 20 deletions(-) diff --git a/src/common/cuda/rtc/half-inl.h b/src/common/cuda/rtc/half-inl.h index b3306b295637..922bc2f25e45 100644 --- a/src/common/cuda/rtc/half-inl.h +++ b/src/common/cuda/rtc/half-inl.h @@ -72,7 +72,6 @@ struct AccType { return __float2half(val); } }; - )code"; } // namespace rtc diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index f8eac0463dd0..c779cbf99dde 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -37,6 +37,52 @@ using uint8 = unsigned char; using int8 = char; using int32 = int; using int64 = long long; +)code" +#if MSHADOW_INT64_TENSOR_SIZE == 1 +"typedef int64 index_t;\n" +#else +"typedef int32 index_t;\n" +#endif +R"code( +// bool and int8 need to be accumulated in index_t +template<> +struct AccType { + using type = index_t; + + __device__ static inline type from(const bool& val) { + return val; + } + + __device__ static inline bool to(type val) { + return val; + } +}; + +template<> +struct AccType { + using type = index_t; + + __device__ static inline type from(const int8& val) { + return val; + } + + __device__ static inline int8 to(type val) { + return val; + } +}; + +template<> +struct AccType { + using type = index_t; + + __device__ static inline type from(const uint8& val) { + return val; + } + + __device__ static inline uint8 to(type val) { + return val; + } +}; namespace type_util { @@ -143,12 +189,7 @@ struct mixed_type::value && }; } // namespace type_util -)code" -#if MSHADOW_INT64_TENSOR_SIZE == 1 -"typedef int64 index_t;\n"; -#else -"typedef int32 index_t;\n"; -#endif +)code"; const char util_string[] = R"code( enum class OpReqType { diff --git a/src/common/utils.cc b/src/common/utils.cc index df5703b39476..67f1f3137c9f 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -116,17 +116,17 @@ MShadowTypeInfo mshadow_type_info(const int type_flag) { case kFloat64: return MShadowTypeInfo("float64", sizeof(double)); case kFloat16: - return MShadowTypeInfo("float16", 2); + return MShadowTypeInfo("float16", 2, sizeof(float)); case kUint8: - return MShadowTypeInfo("uint8", sizeof(uint8_t)); + return MShadowTypeInfo("uint8", sizeof(uint8_t), sizeof(index_t)); case kInt32: return MShadowTypeInfo("int32", sizeof(int32_t)); case kInt8: - return MShadowTypeInfo("int8", sizeof(int8_t)); + return MShadowTypeInfo("int8", sizeof(int8_t), sizeof(index_t)); case kInt64: return MShadowTypeInfo("int64", sizeof(int64_t)); case kBool: - return MShadowTypeInfo("bool", sizeof(bool)); + return MShadowTypeInfo("bool", sizeof(bool), sizeof(index_t)); default: LOG(FATAL) << "Unknown type flag " << type_flag; return MShadowTypeInfo("INVALID", 1); diff --git 
a/src/common/utils.h b/src/common/utils.h index 178c4cdeb28a..557b2b83354a 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -953,9 +953,13 @@ inline int GetDefaultDtype(int dtype) { struct MShadowTypeInfo { std::string name; int size; + int acc_size; + + MShadowTypeInfo(const std::string name, const int size, const int acc_size) : + name(std::move(name)), size(size), acc_size(acc_size) {} MShadowTypeInfo(const std::string name, const int size) : - name(std::move(name)), size(size) {} + MShadowTypeInfo(name, size, size) {} }; MShadowTypeInfo mshadow_type_info(const int type_flag); diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index d7544ffdd943..9d0a23779ab9 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -419,10 +419,9 @@ void RTCReduce(const NodeAttrs& attrs, using namespace mxnet::common::cuda::rtc; if (req == kNullOp) return; Stream *s = ctx.get_stream(); - size_t type_size = common::mshadow_type_info(small.type_flag_).size; - if (small.type_flag_ == mshadow::kFloat16) { - type_size = sizeof(float); - } + size_t big_type_size = common::mshadow_type_info(big.type_flag_).acc_size; + size_t small_type_size = common::mshadow_type_info(small.type_flag_).acc_size; + size_t type_size = std::max(big_type_size, small_type_size); ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr, type_size); if (config.M == 1) { // With M == 1 result is just (possibly reshaped) OP(big) @@ -460,10 +459,9 @@ void RTCReduce(const NodeAttrs& attrs, using namespace mxnet::common::cuda::rtc; if (req == kNullOp) return; Stream *s = ctx.get_stream(); - size_t type_size = common::mshadow_type_info(small.type_flag_).size; - if (small.type_flag_ == mshadow::kFloat16) { - type_size = sizeof(float); - } + size_t big_type_size = common::mshadow_type_info(big.type_flag_).acc_size; + size_t small_type_size = common::mshadow_type_info(small.type_flag_).acc_size; + size_t type_size = std::max(big_type_size, small_type_size); ReduceImplConfig config(small.shape_, big.shape_, &lhs.shape_, &rhs.shape_, type_size); std::string common_code = std::string("const OpReqType req = ") + util::to_string(req) + From e547594c101d5ed1c14e78aa59932e3ad7ab853f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Sat, 1 Aug 2020 13:32:02 -0700 Subject: [PATCH 57/64] Fix M=1 case --- src/operator/tensor/reduce_rtc.cc | 85 ++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index 9d0a23779ab9..64a29cd969e7 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -345,12 +345,18 @@ __global__ void reduce_kernel_M1(const int N, for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { index_t coord[ndim]; util::unravel(idx, params.small_shape, coord); - const index_t idx_big = util::ravel(coord, params.big_shape); - const index_t idx_lhs = util::ravel(coord, params.lhs_shape); - const index_t idx_rhs = util::ravel(coord, params.rhs_shape); - const typename OType::type val = - OP1(IType0::from(big[idx_big]), OP2(IType1::from(lhs[idx_lhs]), - IType2::from(rhs[idx_rhs]))); + index_t idx_big[1]; + idx_big[0] = util::ravel(coord, params.big_shape); + index_t idx_lhs[1], idx_rhs[1]; + if (use_input) { + idx_lhs[0] = util::ravel(coord, params.lhs_shape); + idx_rhs[0] = util::ravel(coord, params.rhs_shape); + } + typename OType::type val, residual; + REDUCER::SetInitValue(val, 
residual); + const int u = 0; + REDUCER::Reduce(val, FUNC, residual); + REDUCER::Finalize(val, residual); if (req == OpReqType::kAddTo) { const auto temp = op::add(val, OType::from(small[idx])); small[idx] = OType::to(temp); @@ -362,7 +368,7 @@ __global__ void reduce_kernel_M1(const int N, )code"; void RTCReduceM1Impl(Stream *s, const TBlob &small, const TBlob &big, - const TBlob &lhs, const TBlob &rhs, + const TBlob *lhs, const TBlob *rhs, const ReduceImplConfig &config, const int ndim, const std::string &common_code, int dev_id) { using namespace common::cuda::rtc; @@ -372,31 +378,52 @@ void RTCReduceM1Impl(Stream *s, const TBlob &small, const TBlob &big, common::mshadow_type_info(big.type_flag_).name + ";\n" "using InputType1 = " + - common::mshadow_type_info(lhs.type_flag_).name + + ((lhs != nullptr) + ? common::mshadow_type_info(lhs->type_flag_).name + : "float32") + ";\n" "using InputType2 = " + - common::mshadow_type_info(rhs.type_flag_).name + + ((rhs != nullptr) + ? common::mshadow_type_info(rhs->type_flag_).name + : "float32") + ";\n" "using OutputType0 = " + common::mshadow_type_info(small.type_flag_).name + ";\n"; + if (lhs != nullptr) { + code += "const bool use_input = true;"; + } else { + code += "const bool use_input = false;"; + } + reduce_kernel_M1_params param {}; for (int i = 0; i < ndim; ++i) { param.big_shape[i] = big.shape_[i]; param.small_shape[i] = small.shape_[i]; - param.lhs_shape[i] = lhs.shape_[i]; - param.rhs_shape[i] = rhs.shape_[i]; + if (lhs != nullptr) { + param.lhs_shape[i] = lhs->shape_[i]; + param.rhs_shape[i] = rhs->shape_[i]; + } } + void *null_ptr = nullptr; std::vector args; args.emplace_back(&config.N); args.emplace_back(&big.dptr_); - args.emplace_back(&lhs.dptr_); - args.emplace_back(&rhs.dptr_); + if (lhs != nullptr) { + args.emplace_back(&(lhs->dptr_)); + args.emplace_back(&(rhs->dptr_)); + } else { + args.emplace_back(&(null_ptr)); + args.emplace_back(&(null_ptr)); + } args.emplace_back(&small.dptr_); args.emplace_back(¶m); - auto reduce_kernel_M1_func = get_function(code, + const auto &function_code = (lhs == nullptr) + ? 
reduce_function_code + : reduce_function_use_input_code; + auto reduce_kernel_M1_func = get_function(code + function_code, "reduce_kernel_M1", reduce_kernel_M1_code, dev_id); @@ -423,22 +450,22 @@ void RTCReduce(const NodeAttrs& attrs, size_t small_type_size = common::mshadow_type_info(small.type_flag_).acc_size; size_t type_size = std::max(big_type_size, small_type_size); ReduceImplConfig config(small.shape_, big.shape_, nullptr, nullptr, type_size); + std::string common_code = std::string("const OpReqType req = ") + + util::to_string(req) + + ";\n" + "#define OP op::" + + OP + + "\n" + "#define REDUCER " + + reducer + + "\n" + "const int ndim = " + + std::to_string(ndim) + + ";\n"; if (config.M == 1) { - // With M == 1 result is just (possibly reshaped) OP(big) - UnaryRTCCompute {OP} (attrs, ctx, {big}, {req}, {small}); + RTCReduceM1Impl(s, small, big, nullptr, nullptr, config, + ndim, common_code, ctx.run_ctx.ctx.dev_id); } else { - std::string common_code = std::string("const OpReqType req = ") + - util::to_string(req) + - ";\n" - "#define OP op::" + - OP + - "\n" - "#define REDUCER " + - reducer + - "\n" - "const int ndim = " + - std::to_string(ndim) + - ";\n"; RTCReduceImpl(s, small, req == kAddTo, big, workspace, config, ndim, common_code, ctx.run_ctx.ctx.dev_id); } @@ -479,7 +506,7 @@ void RTCReduce(const NodeAttrs& attrs, std::to_string(ndim) + ";\n"; if (config.M == 1) { - RTCReduceM1Impl(s, small, big, lhs, rhs, config, ndim, common_code, ctx.run_ctx.ctx.dev_id); + RTCReduceM1Impl(s, small, big, &lhs, &rhs, config, ndim, common_code, ctx.run_ctx.ctx.dev_id); } else { RTCReduceImpl(s, small, req == kAddTo, big, workspace, config, ndim, common_code, ctx.run_ctx.ctx.dev_id, &lhs, &rhs); From 2758622b80b6fcaeba4562c7d818aaf17fdada4b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 3 Aug 2020 11:26:22 -0700 Subject: [PATCH 58/64] Fixes from rebase Fixes for mixed type gradient functions Set the launch bounds on RTC kernels --- src/common/cuda/rtc.cc | 3 + src/common/cuda/rtc/backward_functions-inl.h | 152 +++++++++++------- src/common/cuda/rtc/forward_functions-inl.h | 81 +++++++--- src/common/cuda/rtc/reducer-inl.h | 6 + src/common/cuda/rtc/util-inl.h | 2 + src/operator/tensor/broadcast_reduce-inl.h | 6 +- .../tensor/elemwise_binary_broadcast_op.cc | 10 +- src/operator/tensor/elemwise_binary_op.cc | 3 + .../tensor/elemwise_binary_scalar_op.cc | 2 + src/operator/tensor/elemwise_sum.cu | 1 + src/operator/tensor/elemwise_unary_op.cc | 1 + src/operator/tensor/reduce_rtc.cc | 12 +- 12 files changed, 193 insertions(+), 86 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 5d2596cc9be9..8f3b3391f5e4 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -224,7 +224,10 @@ void launch(CUfunction function, const_cast(args->data()), // arguments nullptr); // ); if (err != CUDA_SUCCESS) { + const char* error_string; + cuGetErrorString(err, &error_string); LOG(FATAL) << "cuLaunchKernel failed: " + << err << " " << error_string << ": " << reinterpret_cast(function) << " " << "(" << grid_dim.x << ", " << grid_dim.y << ", " << grid_dim.z << ") " << "(" << block_dim.x << ", " << block_dim.y << ", " << block_dim.z << ") " diff --git a/src/common/cuda/rtc/backward_functions-inl.h b/src/common/cuda/rtc/backward_functions-inl.h index 52a3b2aa18d2..168dc686e7ad 100644 --- a/src/common/cuda/rtc/backward_functions-inl.h +++ b/src/common/cuda/rtc/backward_functions-inl.h @@ -47,26 +47,30 @@ backward_sigmoid(const DTypeGrad grad, const DType out) { template 
__device__ inline typename type_util::mixed_type::type backward_softrelu(const DTypeGrad grad, const DType val) { - return grad * sigmoid(val); + const typename type_util::mixed_type::type v = val; + return grad * sigmoid(v); } template __device__ inline typename type_util::mixed_type::type backward_softsign(const DTypeGrad grad, const DType val) { - const DType ap1 = 1 + op::abs(val); + const typename type_util::mixed_type::type v = val; + const auto ap1 = 1 + op::abs(v); return grad / (ap1 * ap1); } template __device__ inline typename type_util::mixed_type::type backward_abs(const DTypeGrad grad, const DType val) { - return grad * op::sign(val); + const typename type_util::mixed_type::type v = val; + return grad * op::sign(v); } template __device__ inline typename type_util::mixed_type::type backward_exp(const DTypeGrad grad, const DType val) { - return grad * op::exp(val); + const typename type_util::mixed_type::type v = val; + return grad * op::exp(v); } template @@ -102,13 +106,15 @@ backward_log1p(const DTypeGrad grad, const DType val) { template __device__ inline typename type_util::mixed_type::type backward_sin(const DTypeGrad grad, const DType val) { - return grad * op::cos(val); + const typename type_util::mixed_type::type v = val; + return grad * op::cos(v); } template __device__ inline typename type_util::mixed_type::type backward_cos(const DTypeGrad grad, const DType val) { - return -grad * op::sin(val); + const typename type_util::mixed_type::type v = val; + return -grad * op::sin(v); } // Uses output from tan @@ -121,13 +127,15 @@ backward_tan(const DTypeGrad grad, const DType out) { template __device__ inline typename type_util::mixed_type::type backward_arcsin(const DTypeGrad grad, const DType val) { - return grad / op::sqrt(1 - val*val); + const typename type_util::mixed_type::type v = val; + return grad / op::sqrt(1 - v*v); } template __device__ inline typename type_util::mixed_type::type backward_arccos(const DTypeGrad grad, const DType val) { - return -grad / op::sqrt(1 - val*val); + const typename type_util::mixed_type::type v = val; + return -grad / op::sqrt(1 - v*v); } template @@ -151,13 +159,15 @@ backward_radians(const DTypeGrad grad, const DType /* val */) { template __device__ inline typename type_util::mixed_type::type backward_sinh(const DTypeGrad grad, const DType val) { - return grad * op::cosh(val); + const typename type_util::mixed_type::type v = val; + return grad * op::cosh(v); } template __device__ inline typename type_util::mixed_type::type backward_cosh(const DTypeGrad grad, const DType val) { - return grad * op::sinh(val); + const typename type_util::mixed_type::type v = val; + return grad * op::sinh(v); } // Uses tanh output @@ -170,13 +180,15 @@ backward_tanh(const DTypeGrad grad, const DType out) { template __device__ inline typename type_util::mixed_type::type backward_arcsinh(const DTypeGrad grad, const DType val) { - return grad / op::sqrt(val * val + 1); + const typename type_util::mixed_type::type v = val; + return grad / op::sqrt(v * v + 1); } template __device__ inline typename type_util::mixed_type::type backward_arccosh(const DTypeGrad grad, const DType val) { - return grad / op::sqrt(val * val - 1); + const typename type_util::mixed_type::type v = val; + return grad / op::sqrt(v * v - 1); } template @@ -194,7 +206,8 @@ backward_sqrt(const DTypeGrad grad, const DType out) { template __device__ inline typename type_util::mixed_type::type backward_rsqrt(const DTypeGrad grad, const DType val) { - const DType inv = 1 / val; + const typename 
type_util::mixed_type::type v = val; + const auto inv = 1 / v; return -0.5 * grad * op::sqrt(inv) * inv; } @@ -207,7 +220,8 @@ backward_cbrt(const DTypeGrad grad, const DType out) { template __device__ inline typename type_util::mixed_type::type backward_rcbrt(const DTypeGrad grad, const DType val) { - const DType inv = 1 / val; + const typename type_util::mixed_type::type v = val; + const auto inv = 1 / v; return -1.f/3.f * grad * op::cbrt(inv) * inv; } @@ -218,15 +232,18 @@ backward_square(const DTypeGrad grad, const DType val) { } template -__device__ inline DType rdiv_grad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +rdiv_grad(const DType val, + const DType2 val2) { return -val2 / (val * val); } template -__device__ inline DType div_grad(const DType val, - const DType2 val2) { - return op::reciprocal(val2); +__device__ inline typename type_util::mixed_type::type +div_grad(const DType val, + const DType2 val2) { + const typename type_util::mixed_type::type temp = val2; + return op::reciprocal(temp); } template @@ -266,69 +283,83 @@ __device__ inline DType rmod_grad(const DType val, } template -__device__ inline DType power_grad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +power_grad(const DType val, + const DType2 val2) { return op::power(val, val2 - 1.f) * val2; } template -__device__ inline DType power_rgrad(const DType val, - const DType2 val2) { - return op::power(val, val2) * op::log(val); +__device__ inline typename type_util::mixed_type::type +power_rgrad(const DType val, + const DType2 val2) { + const typename type_util::mixed_type::type temp = val; + return op::power(val, val2) * op::log(temp); } template -__device__ inline DType rpower_grad(const DType val, - const DType2 val2) { - return val * op::log(val2); +__device__ inline typename type_util::mixed_type::type +rpower_grad(const DType val, + const DType2 val2) { + const typename type_util::mixed_type::type temp = val2; + return val * op::log(temp); } template -__device__ inline DType hypot_grad_left(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +hypot_grad_left(const DType val, + const DType2 val2) { return val / op::hypot(val, val2); } template -__device__ inline DType hypot_grad_right(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +hypot_grad_right(const DType val, + const DType2 val2) { return val2 / op::hypot(val, val2); } template -__device__ inline DType copysign_grad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +copysign_grad(const DType val, + const DType2 val2) { return (val >= 0 && val2 >= 0) || (val < 0 && val2 < 0) ? 
1 : -1; } template -__device__ inline DType arctan2_grad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +arctan2_grad(const DType val, + const DType2 val2) { return val2 / (val * val + val2 * val2); } template -__device__ inline DType rarctan2_grad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +rarctan2_grad(const DType val, + const DType2 val2) { return val / (val * val + val2 * val2); } template -__device__ inline DType arctan2_rgrad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +arctan2_rgrad(const DType val, + const DType2 val2) { return -rarctan2_grad(val, val2); } template -__device__ inline DType ldexp_grad(const DType val, - const DType2 val2) { +__device__ inline typename type_util::mixed_type::type +ldexp_grad(const DType val, + const DType2 val2) { return op::power(static_cast(2), val2); } template -__device__ inline DType rldexp_grad(const DType val, - const DType2 val2) { - returni val2 * op::power(static_cast(2), val) * op::log(static_cast(2)); +__device__ inline typename type_util::mixed_type::type +rldexp_grad(const DType val, + const DType2 val2) { + using mixed_type = typename type_util::mixed_type::type; + return val2 * op::power(static_cast(2), val) * op::log(static_cast(2)); } template @@ -351,42 +382,55 @@ backward_reciprocal(const DTypeGrad grad, const DType val) { template __device__ inline typename type_util::mixed_type::type backward_erf(const DTypeGrad grad, const DType val) { - return 2.0f / op::sqrt(pi) * op::exp(-(val*val)) * grad; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type v = val; + constexpr mixed_type my_pi = pi; + return 2.0f / op::sqrt(my_pi) * op::exp(-(v*v)) * grad; } template __device__ inline typename type_util::mixed_type::type backward_erfinv(const DTypeGrad grad, const DType val) { - return 0.5f * op::sqrt(pi) * op::exp(val * val) * grad; + using mixed_type = typename type_util::mixed_type::type; + constexpr mixed_type my_pi = pi; + const mixed_type g = grad; + const mixed_type v = val; + return 0.5f * op::sqrt(my_pi) * op::exp(v * v) * g; } template __device__ inline typename type_util::mixed_type::type backward_gamma(const DTypeGrad grad, const DType val) { + using mixed_type = typename type_util::mixed_type::type; + const mixed_type v = val; if (type_util::is_same::value) { - return grad * op::gamma(val) * op::special_functions::cephes::psi(val); + return grad * op::gamma(v) * op::special_functions::cephes::psi(v); } else { - return grad * op::gamma(val) * op::special_functions::cephes::psi(val); + return grad * op::gamma(v) * op::special_functions::cephes::psi(v); } } template __device__ inline typename type_util::mixed_type::type backward_gammaln(const DTypeGrad grad, const DType val) { + using mixed_type = typename type_util::mixed_type::type; + const mixed_type v = val; if (type_util::is_same::value) { - return grad * op::special_functions::cephes::psi(val); + return grad * op::special_functions::cephes::psi(v); } else { - return grad * op::special_functions::cephes::psi(val); + return grad * op::special_functions::cephes::psi(v); } } template __device__ inline typename type_util::mixed_type::type backward_digamma(const DTypeGrad grad, const DType val) { + using mixed_type = typename type_util::mixed_type::type; + const mixed_type v = val; if (type_util::is_same::value) { - return grad * op::special_functions::trigamma(val); + return grad * 
op::special_functions::trigamma(v); } else { - return grad * op::special_functions::trigamma(val); + return grad * op::special_functions::trigamma(v); } } @@ -411,8 +455,8 @@ __device__ inline DType smooth_l1_grad(const DType val, const DType2 scalar) { } template -__device__ inline DType xelu_grad(const DType val, - const DType2 val2) { +__device__ inline DType2 xelu_grad(const DType val, + const DType2 val2) { return (val > 0) ? 1 : val2; } diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index d3995631a167..0ca1d96f2c76 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -356,78 +356,110 @@ rfmod(const DType a, const DType2 b) { template __device__ inline DType equal(const DType a, const DType2 b) { - return a == static_cast(b) ? 1 : 0; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a == real_b ? 1 : 0; } template __device__ inline DType not_equal(const DType a, const DType2 b) { - return a != static_cast(b) ? 1 : 0; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a != real_b ? 1 : 0; } template __device__ inline DType greater(const DType a, const DType2 b) { - return a > static_cast(b) ? 1 : 0; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a > real_b ? 1 : 0; } template __device__ inline DType greater_equal(const DType a, const DType2 b) { - return a >= static_cast(b) ? 1 : 0; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a >= real_b ? 1 : 0; } template __device__ inline DType less(const DType a, const DType2 b) { - return a < static_cast(b) ? 1 : 0; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a < real_b ? 1 : 0; } template __device__ inline DType less_equal(const DType a, const DType2 b) { - return a <= static_cast(b) ? 1 : 0; + return real_a <= real_b ? 1 : 0; } template __device__ inline bool np_equal(const DType a, const DType2 b) { - return a == static_cast(b) ? true : false; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a == real_b ? true : false; } template __device__ inline bool np_not_equal(const DType a, const DType2 b) { - return a != static_cast(b) ? true : false; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a != real_b ? true : false; } template __device__ inline bool np_greater(const DType a, const DType2 b) { - return a > static_cast(b) ? true : false; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a > real_b ? true : false; } template __device__ inline bool np_greater_equal(const DType a, const DType2 b) { - return a >= static_cast(b) ? true : false; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a >= real_b ? true : false; } template __device__ inline bool np_less(const DType a, const DType2 b) { - return a < static_cast(b) ? 
true : false; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a < real_b ? true : false; } template __device__ inline bool np_less_equal(const DType a, const DType2 b) { - return a <= static_cast(b) ? true : false; + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a <= real_b ? true : false; } template __device__ inline DType logical_and(const DType a, const DType2 b) { - return a && static_cast(b) ? 1 : 0; + return a && b ? 1 : 0; } template __device__ inline DType logical_or(const DType a, const DType2 b) { - return a || static_cast(b) ? 1 : 0; + return a || b ? 1 : 0; } template __device__ inline DType logical_xor(const DType a, const DType2 b) { - const DType bb = static_cast(b); - return ((a || bb) && !(a && bb)) ? 1 : 0; + return ((a || b) && !(a && b)) ? 1 : 0; } template @@ -436,7 +468,7 @@ __device__ inline DType copysign(const DType a, const DType2 b) { } template -__device__ inline DType rcopysign(const DType a, const DType2 b) { +__device__ inline DType2 rcopysign(const DType a, const DType2 b) { return copysign(b, a); } @@ -484,19 +516,28 @@ lcm(const DType a, const DType2 b) { template __device__ inline typename type_util::mixed_type::type bitwise_xor(const DType a, const DType2 b) { - return static_cast(a) ^ static_cast(b); + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a ^ real_b; } template __device__ inline typename type_util::mixed_type::type bitwise_or(const DType a, const DType2 b) { - return static_cast(a) | static_cast(b); + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a | real_b; } template __device__ inline typename type_util::mixed_type::type bitwise_and(const DType a, const DType2 b) { - return static_cast(a) & static_cast(b); + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; + return real_a & real_b; } DEFINE_BINARY_MATH_FUNC(arctan2, ::atan2, ::atan2f) diff --git a/src/common/cuda/rtc/reducer-inl.h b/src/common/cuda/rtc/reducer-inl.h index cb9e9a35c9b0..93b702788c46 100644 --- a/src/common/cuda/rtc/reducer-inl.h +++ b/src/common/cuda/rtc/reducer-inl.h @@ -72,6 +72,12 @@ struct sum { dst_residual = t2 - (dst_val - t1); } } + /*! \brief finalize reduction result */ + template + __device__ inline static void Finalize(volatile DType& dst) {} + /*! \brief finalize reduction result */ + template + __device__ inline static void Finalize(volatile DType& dst, volatile DType& none) {} /*! 
*\brief set the initial value during reduction */ diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index c779cbf99dde..1a6aa39d0a62 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -199,6 +199,8 @@ enum class OpReqType { kAddTo }; +constexpr int kRTCMaxThreadsPerBlock = 512; + namespace util { constexpr int MAX_DIM = 5; diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 3b7fec7d60fc..ad3bd2a2bec9 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -734,8 +734,7 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, #if MXNET_USE_CUDA -void RTCReduce(const NodeAttrs& attrs, - const OpContext& ctx, +void RTCReduce(const OpContext& ctx, const TBlob& small, const OpReqType req, const Tensor& workspace, @@ -744,8 +743,7 @@ void RTCReduce(const NodeAttrs& attrs, int ndim, const std::string& OP); -void RTCReduce(const NodeAttrs& attrs, - const OpContext& ctx, +void RTCReduce(const OpContext& ctx, const TBlob& small, const OpReqType req, const Tensor& workspace, diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.cc b/src/operator/tensor/elemwise_binary_broadcast_op.cc index b89ad92a051a..2f9832a173f6 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op.cc @@ -53,6 +53,7 @@ struct binary_broadcast_params { index_t size[2]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void binary_broadcast_kernel( const binary_broadcast_params param, const index_t lead_dim, @@ -137,6 +138,7 @@ struct binary_broadcast_params { index_t size[2]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void single_side_binary_broadcast_kernel( const binary_broadcast_params param, const index_t lead_dim, @@ -372,10 +374,10 @@ void BinaryBroadcastRTCBackwardUseNone::operator()(const nnvm::NodeAttrs& attrs, ctx.requested[0].get_space_typed( Shape1(workspace_size * sizeof(index_t)), s); if (out.shape_.Size() != 0) { - broadcast::RTCReduce(attrs, ctx, lhs, req[0], + broadcast::RTCReduce(ctx, lhs, req[0], workspace, out, "red::sum", NDim, LOP); - broadcast::RTCReduce(attrs, ctx, rhs, req[1], + broadcast::RTCReduce(ctx, rhs, req[1], workspace, out, "red::sum", NDim, ROP); } else { @@ -431,12 +433,12 @@ void BinaryBroadcastRTCBackwardUseIn::operator()(const nnvm::NodeAttrs& attrs, Tensor workspace = ctx.requested[0].get_space_typed(Shape1(workspace_size), s); if (req[0] != kNullOp) { - broadcast::RTCReduce(attrs, ctx, lgrad, req[0], workspace, + broadcast::RTCReduce(ctx, lgrad, req[0], workspace, ograd, lhs, rhs, "red::sum", NDim, "mul", LOP); } if (req[1] != kNullOp) { - broadcast::RTCReduce(attrs, ctx, rgrad, req[1], workspace, + broadcast::RTCReduce(ctx, rgrad, req[1], workspace, ograd, lhs, rhs, "red::sum", NDim, "mul", ROP); } diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index 12bed7dd09a7..c4fb5e1f5a39 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -105,6 +105,7 @@ struct binary_kernel_params { void *outputs[2]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void binary_kernel(const binary_kernel_params params, const index_t lead_dim, const index_t other_dim, @@ -190,6 +191,7 @@ struct binary_kernel_params { void *outputs[2]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void binary_kernel_bwd(const 
binary_kernel_params params, const index_t lead_dim, const index_t other_dim, @@ -312,6 +314,7 @@ struct binary_kernel_params { void *outputs[2]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void binary_kernel_bwd(const binary_kernel_params params, const index_t lead_dim, const index_t other_dim, diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cc b/src/operator/tensor/elemwise_binary_scalar_op.cc index bb565e5857b5..f09bf21cceb4 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op.cc @@ -43,6 +43,7 @@ struct binary_scalar_kernel_params { double scalar; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void binary_scalar_kernel(const binary_scalar_kernel_params params, const index_t lead_dim, const index_t other_dim, @@ -163,6 +164,7 @@ struct binary_scalar_kernel_params { double scalar; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void binary_scalar_kernel_bwd(const binary_scalar_kernel_params params, const index_t lead_dim, const index_t other_dim, diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index aaf0e8f4dbb1..acee34fb35bc 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -49,6 +49,7 @@ struct elementwise_sum_params { void* outputs[1]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void elementwise_sum_kernel( const elementwise_sum_params params, const index_t lead_dim, diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index 0f707704ca18..df51c7ba2d12 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -43,6 +43,7 @@ struct unary_kernel_params { void *outputs[1]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void unary_kernel(const unary_kernel_params params, const index_t lead_dim, const index_t other_dim, diff --git a/src/operator/tensor/reduce_rtc.cc b/src/operator/tensor/reduce_rtc.cc index 64a29cd969e7..9e2d6d3f2a53 100644 --- a/src/operator/tensor/reduce_rtc.cc +++ b/src/operator/tensor/reduce_rtc.cc @@ -71,6 +71,7 @@ struct reduce_kernel_params { index_t rhs_shape[util::MAX_DIM]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void reduce_kernel(const int N, const int M, const bool addto, const InputType0* __restrict big, const InputType1* __restrict lhs, @@ -157,6 +158,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, __syncthreads(); } if (idx < N && tidy == 0) { + REDUCER::Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]); if (addto) { small[idx + m0 * N] = OType::to(op::add(OType::from(small[idx + m0 * N]), shTile[tidx * 2])); @@ -166,6 +168,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, } } else { if (idx < N) { + REDUCER::Finalize(val, residual); if (addto) { small[idx + m0 * N] = OType::to(op::add(OType::from(small[idx + m0 * N]), val)); @@ -180,6 +183,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, )code"; const char reduce_lines_kernel_code[] = R"code( +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void reduce_lines_kernel(const index_t N, const index_t M, const index_t small_in_stride, const OutputType0* __restrict small_in, @@ -195,6 +199,7 @@ __global__ void reduce_lines_kernel(const index_t N, const index_t M, } if (idx < N) { + REDUCER::Finalize(val, residual); if (req == OpReqType::kAddTo) { small_out[idx] = OType::to(op::add(OType::from(small_out[idx]), 
val)); } else { @@ -332,6 +337,7 @@ struct reduce_kernel_M1_params { index_t small_shape[util::MAX_DIM]; }; +__launch_bounds__(kRTCMaxThreadsPerBlock) __global__ void reduce_kernel_M1(const int N, const InputType0* __restrict big, const InputType1* __restrict lhs, @@ -434,8 +440,7 @@ void RTCReduceM1Impl(Stream *s, const TBlob &small, const TBlob &big, } // namespace -void RTCReduce(const NodeAttrs& attrs, - const OpContext& ctx, +void RTCReduce(const OpContext& ctx, const TBlob& small, const OpReqType req, const Tensor& workspace, @@ -471,8 +476,7 @@ void RTCReduce(const NodeAttrs& attrs, } } -void RTCReduce(const NodeAttrs& attrs, - const OpContext& ctx, +void RTCReduce(const OpContext& ctx, const TBlob& small, const OpReqType req, const Tensor& workspace, From a60c483b8dfec1af009367f9ff2d4da314b66803 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 3 Aug 2020 16:27:29 -0700 Subject: [PATCH 59/64] Fix --- src/common/cuda/rtc/forward_functions-inl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 0ca1d96f2c76..700276df43ff 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -396,6 +396,9 @@ __device__ inline DType less(const DType a, const DType2 b) { template __device__ inline DType less_equal(const DType a, const DType2 b) { + using mixed_type = typename type_util::mixed_type::type; + const mixed_type real_a = a; + const mixed_type real_b = b; return real_a <= real_b ? 1 : 0; } From f1a2f542c758c5d564f29a696d28ab57ee2c485c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 4 Aug 2020 09:29:53 -0700 Subject: [PATCH 60/64] Fix tests --- src/common/cuda/rtc/forward_functions-inl.h | 24 +++++------ src/common/cuda/rtc/util-inl.h | 47 ++++++++++++++++++++- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/src/common/cuda/rtc/forward_functions-inl.h b/src/common/cuda/rtc/forward_functions-inl.h index 700276df43ff..14ee83cd0759 100644 --- a/src/common/cuda/rtc/forward_functions-inl.h +++ b/src/common/cuda/rtc/forward_functions-inl.h @@ -403,7 +403,7 @@ __device__ inline DType less_equal(const DType a, const DType2 b) { } template -__device__ inline bool np_equal(const DType a, const DType2 b) { +__device__ inline bool_t np_equal(const DType a, const DType2 b) { using mixed_type = typename type_util::mixed_type::type; const mixed_type real_a = a; const mixed_type real_b = b; @@ -411,7 +411,7 @@ __device__ inline bool np_equal(const DType a, const DType2 b) { } template -__device__ inline bool np_not_equal(const DType a, const DType2 b) { +__device__ inline bool_t np_not_equal(const DType a, const DType2 b) { using mixed_type = typename type_util::mixed_type::type; const mixed_type real_a = a; const mixed_type real_b = b; @@ -419,7 +419,7 @@ __device__ inline bool np_not_equal(const DType a, const DType2 b) { } template -__device__ inline bool np_greater(const DType a, const DType2 b) { +__device__ inline bool_t np_greater(const DType a, const DType2 b) { using mixed_type = typename type_util::mixed_type::type; const mixed_type real_a = a; const mixed_type real_b = b; @@ -427,7 +427,7 @@ __device__ inline bool np_greater(const DType a, const DType2 b) { } template -__device__ inline bool np_greater_equal(const DType a, const DType2 b) { +__device__ inline bool_t np_greater_equal(const DType a, const DType2 b) { using mixed_type = typename type_util::mixed_type::type; const mixed_type real_a = a; const mixed_type 
real_b = b; @@ -435,7 +435,7 @@ __device__ inline bool np_greater_equal(const DType a, const DType2 b) { } template -__device__ inline bool np_less(const DType a, const DType2 b) { +__device__ inline bool_t np_less(const DType a, const DType2 b) { using mixed_type = typename type_util::mixed_type::type; const mixed_type real_a = a; const mixed_type real_b = b; @@ -443,7 +443,7 @@ __device__ inline bool np_less(const DType a, const DType2 b) { } template -__device__ inline bool np_less_equal(const DType a, const DType2 b) { +__device__ inline bool_t np_less_equal(const DType a, const DType2 b) { using mixed_type = typename type_util::mixed_type::type; const mixed_type real_a = a; const mixed_type real_b = b; @@ -863,7 +863,7 @@ __device__ inline DType logical_not(const DType val) { } template -__device__ inline bool np_logical_not(const DType val) { +__device__ inline bool_t np_logical_not(const DType val) { return !static_cast(val); } @@ -873,22 +873,22 @@ __device__ inline bool isnan(const DType val) { } template -__device__ inline bool isinf(const DType val) { +__device__ inline bool_t isinf(const DType val) { return util::isinf(val); } template -__device__ inline bool isposinf(const DType val) { +__device__ inline bool_t isposinf(const DType val) { return util::isinf(val) && (val > 0); } template -__device__ inline bool isneginf(const DType val) { +__device__ inline bool_t isneginf(const DType val) { return util::isinf(val) && (val < 0); } template -__device__ inline bool isfinite(const DType val) { +__device__ inline bool_t isfinite(const DType val) { return !op::isnan(val) && !op::isinf(val); } @@ -896,7 +896,7 @@ __device__ inline bool isfinite(const DType val) { template __device__ inline DType bitwise_not(const DType a) { - if (type_util::is_same::value) { + if (type_util::is_same::value) { return !a; } else { return ~static_cast(a); diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index 1a6aa39d0a62..a4779fef0499 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -45,9 +45,32 @@ using int64 = long long; #endif R"code( // bool and int8 need to be accumulated in index_t +// but bool needs to be treated in the special way +// for ops like bitwise_not +struct bool_t { + index_t value; + + __device__ inline bool_t(const index_t& v) : value(v) {} + __device__ inline bool_t(const volatile index_t& v) : value(v) {} + __device__ inline bool_t() : value(0) {} + + __device__ inline operator index_t() const volatile { return value; } + __device__ inline bool_t& operator= (const index_t& v) { + value = v; + return *this; + } + __device__ inline volatile bool_t& operator= (const index_t& v) volatile { + value = v; + return *this; + } + __device__ inline bool_t& operator= (const volatile index_t& v) { + value = v; + return *this; + } +}; template<> struct AccType { - using type = index_t; + using type = bool_t; __device__ static inline type from(const bool& val) { return val; @@ -101,11 +124,13 @@ template <> struct is_integral : true_type {}; template <> struct is_integral : true_type {}; template <> struct is_integral : true_type {}; template <> struct is_integral : true_type {}; +template <> struct is_integral : true_type {}; // is_unsigned template struct is_unsigned : false_type {}; template <> struct is_unsigned : true_type {}; template <> struct is_unsigned : true_type {}; +template <> struct is_unsigned : true_type {}; // is_same template @@ -177,6 +202,7 @@ struct mixed_type::value>::type> { template struct mixed_type::value 
&& is_integral::value && + !is_same::value && sizeof(T) <= sizeof(U)>::type> { using type = U; }; @@ -184,10 +210,29 @@ struct mixed_type::value && template struct mixed_type::value && is_integral::value && + !is_same::value && sizeof(T) < sizeof(U)>::type> { using type = U; }; +template +struct mixed_type::value && + sizeof(T) < sizeof(bool_t)>::type> { + using type = index_t; +}; + +template +struct mixed_type::value && + sizeof(T) < sizeof(bool_t)>::type> { + using type = index_t; +}; + +template +struct mixed_type::value && + sizeof(T) == sizeof(bool_t)>::type> { + using type = T; +}; + } // namespace type_util )code"; From 34c9d0007758fd6eaf35be0bfc97401ed1fb1080 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 7 Aug 2020 13:48:57 -0700 Subject: [PATCH 61/64] Adding tutorial for RTC --- .../python/tutorials/extend/index.rst | 6 + .../src/pages/api/faq/add_op_in_backend.md | 1 + .../src/pages/api/faq/using_rtc.md | 465 ++++++++++++++++++ src/common/cuda/rtc/vectorization-inl.h | 5 +- 4 files changed, 476 insertions(+), 1 deletion(-) create mode 100644 docs/static_site/src/pages/api/faq/using_rtc.md diff --git a/docs/python_docs/python/tutorials/extend/index.rst b/docs/python_docs/python/tutorials/extend/index.rst index 8cca07f48141..ba708c6c3b6c 100644 --- a/docs/python_docs/python/tutorials/extend/index.rst +++ b/docs/python_docs/python/tutorials/extend/index.rst @@ -47,6 +47,12 @@ The following tutorials will help you learn how to customize MXNet. How to create new MXNet operators in MXNet's backend using C++. An example custom quadratic function op. + .. card:: + :title: Using runtime compilation (RTC) to write CUDA kernels in MXNet + :link: https://mxnet.apache.org/api/faq/using_rtc + + How to write CUDA kernels in MXNet using runtime compilation. + .. toctree:: :hidden: diff --git a/docs/static_site/src/pages/api/faq/add_op_in_backend.md b/docs/static_site/src/pages/api/faq/add_op_in_backend.md index 19e55ec432cf..f8b8a0d8f8b1 100644 --- a/docs/static_site/src/pages/api/faq/add_op_in_backend.md +++ b/docs/static_site/src/pages/api/faq/add_op_in_backend.md @@ -721,3 +721,4 @@ and ## Additional Resources - [Use TensorInspector to Help Debug Operators](./tensor_inspector_tutorial) +- [Use RTC to write CUDA kernels](./using_rtc) diff --git a/docs/static_site/src/pages/api/faq/using_rtc.md b/docs/static_site/src/pages/api/faq/using_rtc.md new file mode 100644 index 000000000000..17efdc303fb8 --- /dev/null +++ b/docs/static_site/src/pages/api/faq/using_rtc.md @@ -0,0 +1,465 @@ +--- +layout: page_category +title: Using runtime compilation (RTC) to write CUDA kernels in MXNet +category: faq +faq_c: Extend and Contribute to MXNet +question: How do I implement GPU functions in MXNet using RTC? +permalink: /api/faq/using_rtc +--- + + + + + + + + + + + + + + + + + +# Using runtime compilation (RTC) to write CUDA kernels in MXNet + +## Introduction + +CUDA kernel is a function running on the GPU to perform computation. This tutorial assumes the +reader has a basic knowledge about how to write such kernels. + +There are currently 2 typical ways of writing and launching CUDA kernels in MXNet. The first one is +to use the `Kernel<...>::Launch()` API, which is suitable for simple elementwise operations and +enables writing only portion of the kernel, leaving the launch mechanism to MXNet. The +other one is to write a kernel from scratch and launch it using the `<<<...>>>` method from CUDA. +Starting from MXNet 2.0, there is a third option - runtime compilation (RTC). 
This differs from the +previous methods (which use kernels compiled ahead of time), as it compiles the needed kernels +during runtime of the user script. + +In this tutorial we will cover the reasons for using RTC instead of the other methods, show how to +do it, as well as tips on what to keep in mind when doing it. + +# Why RTC? + +## Problems with kernels compiled ahead of time + +The use of kernels compiled ahead of time in MXNet leads to a few problems, which unfortunately +are mostly invisible in any single PR, but grow over the course of many contributions and result in +serious issues. + +In order to understand them, let us look at the typical way kernels are launched in MXNet. This +example shows a launch of a simple kernel, taking a single input of type `DType` and producing a +single output of type `OType`: + +```cpp +MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { + Kernel<...>::Launch(s, inputs[0].dptr<DType>(), outputs[0].dptr<OType>()); + }); +}); +``` + +This launch mechanism uses the `MSHADOW_TYPE_SWITCH` macro, which produces a version of the kernel +for every possible type. In the case of nested usage (as is the case in the example shown) it +produces a version of the kernel for every combination of types. This results in a large number of +kernels being generated. + +Another factor that multiplies the number of kernels is that different GPU architectures require +different compiled binaries. Therefore for MXNet to support all of them with a single binary, that +binary needs to contain copies of those kernels for each architecture. + +This proliferation of CUDA kernels in the binary leads to multiple issues. The first problem is the +size of the MXNet library - each compiled version of the kernel takes some space in the binary, +which is small but multiplied by the number of all versions (which could reach thousands per +GPU architecture) and by the number of GPU architectures. This increase in size led to multiple issues reported with +distribution of the MXNet package, +[building the library](https://github.com/apache/incubator-mxnet/issues/17045) as well as +[limiting the number of architectures natively +supported](https://github.com/apache/incubator-mxnet/pull/18205). + +The second issue is the "idle" memory consumption of the MXNet library. In order to efficiently +launch kernels when they are called, the CUDA driver needs to transfer them to the GPU memory ahead of +time. Since it cannot anticipate which kernels will actually be used, all of the kernels are +transferred when the CUDA context is created on a GPU. This means that, even if a user never uses +e.g. a kernel which adds `int8` and `float16` tensors, that kernel still occupies memory on their GPU, +reducing the amount of memory available for useful work. + +The third issue, mostly affecting MXNet developers, is the compilation time of the MXNet library. +The more kernel versions need to be compiled, the more time and hardware resources are needed. + +## RTC to the rescue! + +All of the issues mentioned in the previous paragraph are solved when using runtime compilation. +Using this paradigm, only the kernels actually invoked in the user script are compiled. They do not +occupy space in the MXNet binary and there are no unused kernels stored in users' GPU memory. + +RTC also enables more features: + + - using more information about specific usage of the kernel when compiling it (e.g.
using shape + information of the inputs) to optimize it better + - writing kernels accepting any combination of input and output types + - (in the future) fusing more operations into the generated kernels. + +# RTC for kernel developers + +## Example: unary operators + +Let us start with an example of a simple kernel written using RTC: a kernel which performs a unary +operation (with a concrete example of sigmoid) on its input. It is not a toy example though: it is +a fully generic kernel, capable of operating on any combination of input and output types, as well +as applying any unary operator: + +```cpp +struct UnaryRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector<TBlob>& inputs, + const std::vector<OpReqType>& req, + const std::vector<TBlob>& outputs); +}; + +const char unary_kernel_fwd[] = R"code( + +__launch_bounds__(kRTCMaxThreadsPerBlock) +__global__ void unary_kernel(const InputType* input, + OutputType* output, + const index_t N) { + using IType = AccType<InputType>; + using OType = AccType<OutputType>; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < N; + tid += gridDim.x * blockDim.x) { + const auto val = IType::from(input[tid]); + const auto temp = OP(val); // enables returning different type + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(output[tid])); + output[tid] = OType::to(temp2); + } else { + output[tid] = OType::to(temp); + } + } +} + +)code"; + +void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector<TBlob>& inputs, + const std::vector<OpReqType>& req, + const std::vector<TBlob>& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream<gpu>* s = ctx.get_stream<gpu>(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + "#define OP op::" + + OP + + "\n" + "using InputType = " + + common::mshadow_type_info(inputs[0].type_flag_).name + + ";\n" + "using OutputType = " + + common::mshadow_type_info(outputs[0].type_flag_).name + + ";\n"; + + std::vector<const void*> args; + const index_t size = outputs[0].Size(); + args.emplace_back(&(inputs[0].dptr_)); + args.emplace_back(&(outputs[0].dptr_)); + args.emplace_back(&size); + + auto kernel = get_function(code, "unary_kernel", unary_kernel_fwd, + ctx.run_ctx.get_ctx().dev_id); + + const int n_threads = 512; + const index_t n_blocks = (size + n_threads - 1) / n_threads; + const int shared_memory_size = 0; + launch(kernel, {n_blocks, 1, 1}, {512, 1, 1}, + shared_memory_size, s, &args); +} + +NNVM_REGISTER_OP(sigmoid) +.set_attr<FCompute>("FCompute<gpu>", UnaryRTCCompute{"sigmoid"}); +``` + +## Kernels are text... + +The main difference when writing kernels using RTC is that the kernel code becomes a text string.
+This means that it is possible to change or compose the code at runtime, as is done here: + +```cpp + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + "#define OP op::" + + OP + + "\n" + "using InputType = " + + common::mshadow_type_info(inputs[0].type_flag_).name + + ";\n" + "using OutputType = " + + common::mshadow_type_info(outputs[0].type_flag_).name + + ";\n"; +``` + +where the operation `OP` is also provided as a string in the operator declaration: + +```cpp +NNVM_REGISTER_OP(sigmoid) +.set_attr("FCompute", UnaryRTCCompute{"sigmoid"}); +``` + +## and do not know MXNet source code + +How does the kernel know what operation it should perform? The kernel's source code uses `OP`, +which shows up in the `code` variable and is equal to `op::sigmoid`. Let us compare this to how the +same operator is defined for CPU: + +```cpp +MXNET_OPERATOR_REGISTER_UNARY(sigmoid) +.set_attr("FCompute", UnaryOp::Compute) +``` + +Since the kernel is compiled at runtime, it does not have access to the rest of the MXNet source +code, including `mshadow_op.h`, which defined `mshadow_op::sigmoid`. This means that we need to +provide the kernel with definitions of those functions (again, in text string form). Every +RTC-compiled kernel is prepended with a common header, containing string found in +`src/common/cuda/rtc/` directory. The `src/common/cuda/rtc/forward_functions-inl.h` file contains +the definition of `op::sigmoid`: + +```cpp +template +__device__ inline DType sigmoid(const DType val) { + if (type_util::has_double_or_integral::value) { + return 1./(1 + ::exp(-val)); + } else { + return 1.f/(1 + expf(-val)); + } +} +``` + +## Handling of data types + +MXNet has support for many datatypes. Some of those datatypes, like `float16`, `int8` or `bool` are +useful when storing the results, but in many computations they are too limiting as they can easily +overflow in the intermediate stages. That is why in the example we use `AccType` class - it +provides an accumulation type, that is potentially larger than the storage type - for example, +`AccType::type` is `float32`. It also provides special loading and storing functions: +`AccType::from()` and `AccType::to()`. + +One of the features of RTC-enabled kernels is to be able to accommodate any combination of the +input and output datatypes. Using `auto` as the output type of the intermediate steps helps with, +especially since many binary operators return a mixed type: + +```cpp +template +__device__ inline typename type_util::mixed_type::type +add(const DType a, const DType2 b) { + return a + b; +} +``` + +`mixed_type::type` is a type capable of storing value of the operation between 2 types `T` and +`U` - e.g. `mixed_type::type = float64` and `mixed_type::type = +float32`. + +## Compiling and launching RTC kernels + +The kernel code stored in `unary_kernel_fwd` is generic and relies on multiple names to be defined, +like `req`, `OP` or `InputType`. 
This is handled in the specific operator that uses the kernel, by +defining a set of parameters that will be concatenated to the code during compilation: + +```cpp + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + "#define OP op::" + + OP + + "\n" + "using InputType = " + + common::mshadow_type_info(inputs[0].type_flag_).name + + ";\n" + "using OutputType = " + + common::mshadow_type_info(outputs[0].type_flag_).name + + ";\n"; +``` + +In order to compile the kernel, the `mxnet::common::cuda::rtc::get_function` method is used: + +```cpp + auto kernel = get_function(code, "unary_kernel", unary_kernel_fwd, + ctx.run_ctx.get_ctx().dev_id); +``` + +In order to eliminate overheads coming from the compilation, `get_function` uses a cache of kernels, with a key +being the name of the kernel (`"unary_kernel"` in our case) and the set of parameters (`code` in our +case). If the kernel is already in the cache, it is returned; otherwise compilation takes place. If it +fails, the full source code is saved to disk and an MXNet error with the compilation log is +generated. + +To launch the kernel, the `mxnet::common::cuda::rtc::launch` method is used: + +```cpp + launch(kernel, {n_blocks, 1, 1}, {512, 1, 1}, + shared_memory_size, s, &args); +``` + +It takes the kernel object, the grid and block dimensions, the size of dynamic shared memory, the stream and +the kernel parameters. + +# Other features enabled by RTC + +## Vectorization + +The actual kernel used for applying unary operators in MXNet looks slightly different compared +to the simple example shown in the previous paragraph. Differences come from using vectorization. +This means that, instead of reading (or writing) one element at a time, the kernel accesses +multiple array elements at once. This is beneficial, especially when dealing with smaller +types like `float16` or `int8`. Accessing those small types one by one is inefficient and does not +saturate the memory bandwidth of the GPU, so using vector accesses improves the achieved memory +bandwidth.
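+
+To get some intuition before looking at the real implementation, here is a minimal standalone
+sketch of the idea (this is not MXNet code - the kernel name and parameters are made up for
+illustration): the data pointer is reinterpreted as a wider vector type, so every load and store
+moves two `float16` values in a single 4-byte transaction. It assumes an even number of elements
+and aligned pointers; the MXNet helpers shown afterwards take care of misalignment and leftover
+elements.
+
+```cpp
+#include <cuda_fp16.h>
+
+// Minimal sketch only: scale 2 * n_pairs half-precision values by alpha,
+// reading and writing two of them per 4-byte memory transaction via __half2.
+__global__ void scale_half2(__half2* __restrict__ out,
+                            const __half2* __restrict__ in,
+                            const float alpha, const int n_pairs) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+       i < n_pairs;
+       i += gridDim.x * blockDim.x) {
+    const float2 v = __half22float2(in[i]);                // one load brings in 2 values
+    out[i] = __floats2half2_rn(alpha * v.x, alpha * v.y);  // one store writes 2 values
+  }
+}
+```
+
+The actual MXNet kernel below expresses the same idea through the RTC vectorization helpers, which
+additionally handle arbitrary types, alignment and the tail of the array.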
+ +```cpp + +// excerpt from src/operator/tensor/elemwise_unary_op.h +struct UnaryRTCCompute { + std::string OP; + + void operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); +}; + +// excerpt from src/operator/tensor/elemwise_unary_op.cc +struct unary_kernel_params { + const void *inputs[1]; + void *outputs[1]; +}; + +const char unary_kernel_fwd[] = R"code( + +struct unary_kernel_params { + const void *inputs[1]; + void *outputs[1]; +}; + +__launch_bounds__(kRTCMaxThreadsPerBlock) +__global__ void unary_kernel(const unary_kernel_params params, + const index_t lead_dim, + const index_t other_dim, + const index_t N, + const index_t num_aligned_elements) { + using namespace vector; + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); + + using IType = AccType; + using OType = AccType; + + const index_t M = num_aligned_elements; + + for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < M; + tid += gridDim.x * blockDim.x) { + loader.load(tid, N); + if (req == OpReqType::kAddTo) { + storer.load(tid, N); + } +#pragma unroll + for (int i = 0; i < nvec; ++i) { + const auto input = IType::from(loader.separate()[i]); + const auto temp = OP(input); // enables returning different type + + if (req == OpReqType::kAddTo) { + // temp2 may have a wider type than either temp + // or OType + const auto temp2 = op::add(temp, OType::from(storer.separate()[i])); + storer.separate()[i] = OType::to(temp2); + } else { + storer.separate()[i] = OType::to(temp); + } + } + storer.store(tid, N); + } +} + +)code"; + +void UnaryRTCCompute::operator()(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet::common::cuda::rtc; + if (req[0] == kNullOp) return; + mshadow::Stream* s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + const std::string code = std::string("const OpReqType req = ") + + util::to_string(req[0]) + + ";\n" + "#define OP op::" + + OP + + "\n"; + const int nvec = outputs[0].type_flag_ == mshadow::kFloat64 ? 2 : 4; + + const index_t size = outputs[0].Size(); + unary_kernel_params params = { {inputs[0].dptr_}, + {outputs[0].dptr_} }; + + VectorizedKernelRTCLauncher(code, "unary_kernel", + unary_kernel_fwd, nvec, + size, 1, s, params, + inputs, outputs, + ctx.run_ctx.get_ctx().dev_id); +} + +// excerpt from src/operator/tensor/elemwise_unary_op_basic.cu +NNVM_REGISTER_OP(sigmoid) +.set_attr("FCompute", UnaryRTCCompute{"sigmoid"}); +``` + +RTC implementation in MXNet provides a few useful helper functions and classes, which simplify the +process of writing and launching kernels using vectorization. For accessing the memory using +vectorization, 2 classes are provided, used in this kernel to access input and output array: + +```cpp + VectorizedLoader loader( + reinterpret_cast(params.inputs[0]), N); + VectorizedStorer storer( + reinterpret_cast(params.outputs[0]), N); +``` + +The `loader` object accesses `params.inputs[0]` pointer to array of N elements having type +`InputType0` (which is the name assigned to the type of the first input by the +`VectorizedKernelRTCLauncher`, which is the helper launcher function). It loads `nvec` elements at +a time and has additional `aligned` option, which is also set by the `VectorizedKernelRTCLauncher`. 
+Similarly `storer` object is used to write data of type `OutputType0` to `params.outputs[0]`. + +The kernel using `VectorizedKernelRTCLauncher` needs to have specific parameters: + +```cpp +__global__ void unary_kernel(const unary_kernel_params params, // kernel-specific parameters + const index_t lead_dim, // lead dimension of the tensor + const index_t other_dim, // size of the other dimensions + const index_t N, // total number of elements + const index_t num_aligned_elements) { // number of vector elements in + // lead dimension +``` diff --git a/src/common/cuda/rtc/vectorization-inl.h b/src/common/cuda/rtc/vectorization-inl.h index 173e580fddd2..9868069daf73 100644 --- a/src/common/cuda/rtc/vectorization-inl.h +++ b/src/common/cuda/rtc/vectorization-inl.h @@ -279,10 +279,13 @@ inline int CalcAlignment(const void *ptr, const int size) { return ptr_as_number % size; } -/* \brief Check alignment of the inputs and outputs when cast to LType*. +/* \brief Check alignment of the inputs and outputs when using vectorized accesses. \param params Structure containing arrays with inputs' and outputs' pointers \param lead_dim Leading dimension of the tensors. \param other_dim The size of the other dimensions of the tensors. + \param nvec Length of the vector. + \param inputs Inputs to the operator. + \param outputs Outputs of the operator. */ template Alignment CheckAlignment(const Params& params, const index_t lead_dim, From f4e039ce68791c9a1501d646792c237d9a0d68c8 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 13 Aug 2020 12:24:46 -0700 Subject: [PATCH 62/64] Fixes after merge --- src/operator/numpy/random/np_gamma_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/numpy/random/np_gamma_op.h b/src/operator/numpy/random/np_gamma_op.h index e1d031fb9a0b..57d46ff5cf51 100644 --- a/src/operator/numpy/random/np_gamma_op.h +++ b/src/operator/numpy/random/np_gamma_op.h @@ -420,7 +420,7 @@ inline void GammaReparamBackwardImpl(const OpContext& ctx, const TBlob alpha = inputs[1].reshape(new_ishape); TBlob samples = inputs[2].reshape(new_oshape); size_t workspace_size = - ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_); + ReduceWorkspaceSize(s, igrad.shape_, req[0], ograd.shape_, sizeof(DType)); // Convert samples to standard gamma Kernel, xpu>::Launch( s, samples.Size(), samples.dptr(), samples.dptr(), DType(scale)); From 12c8356c0ec282cfa686eb214f35c8e9804c5f19 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 14 Aug 2020 11:25:04 -0700 Subject: [PATCH 63/64] Fixes from review --- .../python/tutorials/extend/index.rst | 11 +- .../src/pages/api/faq/using_rtc.md | 22 ++-- src/common/cuda/rtc/util-inl.h | 9 ++ tests/python/unittest/test_numpy_op.py | 81 ++++++++++++- tests/python/unittest/test_operator.py | 106 +++++++++--------- 5 files changed, 156 insertions(+), 73 deletions(-) diff --git a/docs/python_docs/python/tutorials/extend/index.rst b/docs/python_docs/python/tutorials/extend/index.rst index ba708c6c3b6c..fd09a053fc22 100644 --- a/docs/python_docs/python/tutorials/extend/index.rst +++ b/docs/python_docs/python/tutorials/extend/index.rst @@ -36,20 +36,20 @@ The following tutorials will help you learn how to customize MXNet. .. card:: :title: New Operator Creation - :link: https://mxnet.apache.org/api/faq/new_op + :link: /api/faq/new_op How to create new MXNet operators using CustomOp (Python) or NNVM (C++). .. 
card:: :title: A Beginner’s Guide to Implementing Operators in MXNet Backend - :link: https://mxnet.apache.org/api/faq/add_op_in_backend + :link: /api/faq/add_op_in_backend How to create new MXNet operators in MXNet's backend using C++. An example custom quadratic function op. .. card:: :title: Using runtime compilation (RTC) to write CUDA kernels in MXNet - :link: https://mxnet.apache.org/api/faq/using_rtc + :link: /api/faq/using_rtc How to write CUDA kernels in MXNet using runtime compilation. @@ -59,5 +59,6 @@ The following tutorials will help you learn how to customize MXNet. :glob: * - New Operator Creation - New Operator in MXNet Backend + New Operator Creation + New Operator in MXNet Backend + Using RTC for CUDA kernels diff --git a/docs/static_site/src/pages/api/faq/using_rtc.md b/docs/static_site/src/pages/api/faq/using_rtc.md index 17efdc303fb8..6a772ee3c7f9 100644 --- a/docs/static_site/src/pages/api/faq/using_rtc.md +++ b/docs/static_site/src/pages/api/faq/using_rtc.md @@ -41,9 +41,9 @@ during runtime of the user script. In this tutorial we will cover the reasons for using RTC instead of the other methods, show how to do it, as well as tips on what to keep in mind when doing it. -# Why RTC? +## Why RTC? -## Problems with kernels compiled ahead of time +### Problems with kernels compiled ahead of time The use of kernels compiled ahead of time in MXNet leads to a few problems, which unfortunately are mostly invisible in any single PR, but grow over the course of many contributions and result in @@ -89,7 +89,7 @@ reducing the amount of memory available for useful work. The third issue, mostly affecting MXNet developers, is the compilation time of the MXNet library. The more kernels versions need to be compiled, the more time and hardware resources is needed. -## RTC to the rescue! +### RTC to the rescue! All of the issues mentioned in the previous paragraph are solved when using runtime compilation. Using this paradigm, only the kernels actually invoked in the user script are compiled. They do not @@ -102,9 +102,9 @@ RTC also enables more features: - writing kernels accepting any combinations of input and output types - (in the future) fusing more operations into the generated kernels. -# RTC for kernel developers +## RTC for kernel developers -## Example: unary operators +### Example: unary operators Let us start with an example of the simple kernel written using RTC: a kernel which performs unary operation (with a concrete example of sigmoid) on its input. It is not a toy example though: it is @@ -194,7 +194,7 @@ NNVM_REGISTER_OP(sigmoid) .set_attr("FCompute", UnaryRTCCompute{"sigmoid"}); ``` -## Kernels are text... +### Kernels are text... The main difference when writing kernels using RTC is that the kernel code becomes the text string. This means that it is possible to change or compose the code at runtime, as is done here: @@ -221,7 +221,7 @@ NNVM_REGISTER_OP(sigmoid) .set_attr("FCompute", UnaryRTCCompute{"sigmoid"}); ``` -## and do not know MXNet source code +### and do not know MXNet source code How does the kernel know what operation it should perform? The kernel's source code uses `OP`, which shows up in the `code` variable and is equal to `op::sigmoid`. Let us compare this to how the @@ -250,7 +250,7 @@ __device__ inline DType sigmoid(const DType val) { } ``` -## Handling of data types +### Handling of data types MXNet has support for many datatypes. 
Some of those datatypes, like `float16`, `int8` or `bool` are useful when storing the results, but in many computations they are too limiting as they can easily @@ -275,7 +275,7 @@ add(const DType a, const DType2 b) { `U` - e.g. `mixed_type::type = float64` and `mixed_type::type = float32`. -## Compiling and launching RTC kernels +### Compiling and launching RTC kernels The kernel code stored in `unary_kernel_fwd` is generic and relies on multiple names to be defined, like `req`, `OP` or `InputType`. This is handled in the specific operator using the kernel by @@ -319,9 +319,9 @@ To launch the kernel, the `mxnet::common::cuda::rtc::launch` method is used: It takes the kernel object, grid and block dimensions, size of dynamic shared memory, stream and kernel parameters. -# Other features enabled by RTC +## Other features enabled by RTC -## Vectorization +### Vectorization The actual kernel used for application of unary operator in MXNet looks slightly different compared to the simple example shown in the previous paragraph. Differences come from using vectorization. diff --git a/src/common/cuda/rtc/util-inl.h b/src/common/cuda/rtc/util-inl.h index a4779fef0499..372390fdc117 100644 --- a/src/common/cuda/rtc/util-inl.h +++ b/src/common/cuda/rtc/util-inl.h @@ -37,6 +37,15 @@ using uint8 = unsigned char; using int8 = char; using int32 = int; using int64 = long long; + +static_assert(sizeof(float32) == 4, "Size of float32 is expected to be 4B"); +static_assert(sizeof(float64) == 8, "Size of float64 is expected to be 8B"); +static_assert(sizeof(float16) == 2, "Size of float16 is expected to be 2B"); +static_assert(sizeof(uint8) == 1, "Size of uint8 is expected to be 1B"); +static_assert(sizeof(int8) == 1, "Size of int8 is expected to be 1B"); +static_assert(sizeof(int32) == 4, "Size of int32 is expected to be 4B"); +static_assert(sizeof(int64) == 8, "Size of int64 is expected to be 8B"); + )code" #if MSHADOW_INT64_TENSOR_SIZE == 1 "typedef int64 index_t;\n" diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 93d3e7085148..a45f973a92fa 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -37,7 +37,7 @@ import random from mxnet.test_utils import verify_generator, gen_buckets_probs_with_ppf from mxnet.numpy_op_signature import _get_builtin_op -from mxnet.test_utils import is_op_runnable, has_tvm_ops +from mxnet.test_utils import is_op_runnable, has_tvm_ops, rand_shape_2d from mxnet.operator import get_all_registered_operators @@ -10209,6 +10209,7 @@ def hybrid_forward(self, F, a, *args, **kwargs): assert same(mx_out.asnumpy(), np_out) +@with_seed() @use_np def test_npx_stop_gradient(): class TestStopGradient(HybridBlock): @@ -10236,3 +10237,81 @@ def hybrid_forward(self, F, a): elif grad_req == 'add': assert_almost_equal(new_grad, old_grad + 1) + +@with_seed() +@use_np +def test_np_elementwise_ops_on_misaligned_input(): + a = np.array([1,2,3,4], dtype='float16') + b = np.array([1,2,3,4], dtype='float16') + + c = a[1:3] + d = b[1:3] + # Note: testing just elemwise_add since all elemwise_ops + # share the implementation + c[:] = c + d + mx.nd.waitall() + + a = np.array([1,2,3,4], dtype='float16') + b = np.array([1,2,3,4], dtype='float16') + + c = a[0:3] + d = b[0:3] + c[:] = c + d + mx.nd.waitall() + assert a[3] == 4.0 + + +@with_seed() +@use_np +@pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64']) +@pytest.mark.parametrize('lead_dim', [2, 3, 4, 6, 10]) +@pytest.mark.parametrize('both_ways', 
[False, True]) +def test_np_broadcast_ops_on_misaligned_input(dtype, lead_dim, both_ways): + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], 1, lead_dim] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, L] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = _np.product(shape) + small_size = _np.product(small_shape) + big_size = _np.product(big_shape) + a = np.arange(5000) + b = np.arange(5000) + e = np.arange(5000) + c = a[1:big_size + 1].reshape(tuple(big_shape)) + d = b[1:small_size + 1].reshape(tuple(small_shape)) + f = e[1:size + 1].reshape(tuple(shape)) + f[:] = c + d + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) + + +@with_seed() +@use_np +@pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64']) +@pytest.mark.parametrize('lead_dim', [2, 3, 4, 6, 10]) +@pytest.mark.parametrize('both_ways', [False, True]) +def test_np_broadcast_ops_on_misaligned_input_oneside(dtype, lead_dim, both_ways): + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], shape[1], 1] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, 1] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = _np.product(shape) + small_size = _np.product(small_shape) + big_size = _np.product(big_shape) + a = np.arange(5000) + b = np.arange(5000) + e = np.arange(5000) + c = a[1:big_size + 1].reshape(tuple(big_shape)) + d = b[1:small_size + 1].reshape(tuple(small_shape)) + f = e[1:size + 1].reshape(tuple(shape)) + f[:] = c + d + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 4711508a45c0..6732336ff60c 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9359,60 +9359,54 @@ def test_elementwise_ops_on_misaligned_input(): assert a[3].asscalar() == 4.0 @with_seed() -def test_broadcast_ops_on_misaligned_input(): - dtypes = ['float16', 'float32', 'float64'] - lead_dims = [2,3,4,6,10] - - for dtype in dtypes: - for lead_dim in lead_dims: - for both_ways in [False, True]: - shape = list(rand_shape_2d()) + [lead_dim] - small_shape = [shape[0], 1, lead_dim] - if both_ways: - # Broadcast in both ways [1, K, L] x [M, 1, L] - big_shape = [1, shape[1], lead_dim] - else: - big_shape = shape - size = np.product(shape) - small_size = np.product(small_shape) - big_size = np.product(big_shape) - a = mx.nd.arange(5000) - b = mx.nd.arange(5000) - e = mx.nd.arange(5000) - c = a[1:big_size + 1].reshape(big_shape) - d = b[1:small_size + 1].reshape(small_shape) - f = e[1:size + 1].reshape(shape) - mx.nd.broadcast_add(c, d, out=f) - expected = c.asnumpy() + d.asnumpy() - mx.nd.waitall() - assert_almost_equal(f, expected) - -@with_seed() -def test_broadcast_ops_on_misaligned_input_oneside(): - dtypes = ['float16', 'float32', 'float64'] - lead_dims = [2,3,4,6,10] - - for dtype in dtypes: - for lead_dim in lead_dims: - for both_ways in [False, True]: - shape = list(rand_shape_2d()) + [lead_dim] - small_shape = [shape[0], shape[1], 1] - if both_ways: - # Broadcast in both ways [1, K, L] x [M, 1, 1] - big_shape = [1, shape[1], lead_dim] - else: - big_shape = shape - size = np.product(shape) - small_size = np.product(small_shape) - big_size = np.product(big_shape) - a = mx.nd.arange(5000) - b = mx.nd.arange(5000) - e = mx.nd.arange(5000) - c = a[1:big_size + 1].reshape(big_shape) - d = b[1:small_size + 
1].reshape(small_shape) - f = e[1:size + 1].reshape(shape) - mx.nd.broadcast_add(c, d, out=f) - expected = c.asnumpy() + d.asnumpy() - mx.nd.waitall() - assert_almost_equal(f, expected) +@pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64']) +@pytest.mark.parametrize('lead_dim', [2, 3, 4, 6, 10]) +@pytest.mark.parametrize('both_ways', [False, True]) +def test_broadcast_ops_on_misaligned_input(dtype, lead_dim, both_ways): + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], 1, lead_dim] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, L] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = np.product(shape) + small_size = np.product(small_shape) + big_size = np.product(big_shape) + a = mx.nd.arange(5000) + b = mx.nd.arange(5000) + e = mx.nd.arange(5000) + c = a[1:big_size + 1].reshape(big_shape) + d = b[1:small_size + 1].reshape(small_shape) + f = e[1:size + 1].reshape(shape) + mx.nd.broadcast_add(c, d, out=f) + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) + +@with_seed() +@pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64']) +@pytest.mark.parametrize('lead_dim', [2, 3, 4, 6, 10]) +@pytest.mark.parametrize('both_ways', [False, True]) +def test_broadcast_ops_on_misaligned_input_oneside(dtype, lead_dim, both_ways): + shape = list(rand_shape_2d()) + [lead_dim] + small_shape = [shape[0], shape[1], 1] + if both_ways: + # Broadcast in both ways [1, K, L] x [M, 1, 1] + big_shape = [1, shape[1], lead_dim] + else: + big_shape = shape + size = np.product(shape) + small_size = np.product(small_shape) + big_size = np.product(big_shape) + a = mx.nd.arange(5000) + b = mx.nd.arange(5000) + e = mx.nd.arange(5000) + c = a[1:big_size + 1].reshape(big_shape) + d = b[1:small_size + 1].reshape(small_shape) + f = e[1:size + 1].reshape(shape) + mx.nd.broadcast_add(c, d, out=f) + expected = c.asnumpy() + d.asnumpy() + mx.nd.waitall() + assert_almost_equal(f, expected) From d6b20836f76d60ad3c7814a9fe8a9dcc7ca9ebcb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 17 Aug 2020 19:42:12 -0700 Subject: [PATCH 64/64] Change env var doc and undo the change to toctree --- docs/python_docs/python/tutorials/extend/index.rst | 6 +++--- docs/static_site/src/pages/api/faq/env_var.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/python_docs/python/tutorials/extend/index.rst b/docs/python_docs/python/tutorials/extend/index.rst index e3183a96e2aa..d516b52d4dd6 100644 --- a/docs/python_docs/python/tutorials/extend/index.rst +++ b/docs/python_docs/python/tutorials/extend/index.rst @@ -65,6 +65,6 @@ The following tutorials will help you learn how to customize MXNet. :glob: * - New Operator Creation - New Operator in MXNet Backend - Using RTC for CUDA kernels + New Operator Creation + New Operator in MXNet Backend + Using RTC for CUDA kernels diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index 55e5f38ffa59..5a2d1bc2e70d 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -390,10 +390,10 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. - It works in Symbolic execution as well as in Gluon models hybridized with ```static_alloc=True``` option. - Only applies to MXNet that has been compiled with CUDA (```pip install mxnet-cuXX``` or built from source with ```USE_CUDA=1```) and running on GPU. 
-* MXNET_FUSION_VERBOSE +* MXNET_RTC_VERBOSE - Values: 0(false) or 1(true) ```(default=0)``` - - Only applies to MXNet that has been compiled with CUDA and when ```MXNET_USE_FUSION``` option is enabled. - - If this variable is set, MXNet will print the code for fused operators that it generated. + - Only applies to MXNet that has been compiled with CUDA. + - If this variable is set, MXNet will print the code for operators compiled at runtime. * MXNET_ELIMINATE_COMMON_EXPR - Values: 0(false) or 1(true) ```(default=1)```